diff --git a/Flax.flaxproj b/Flax.flaxproj index aa3a8655f..ee5ab4547 100644 --- a/Flax.flaxproj +++ b/Flax.flaxproj @@ -4,7 +4,7 @@ "Major": 1, "Minor": 12, "Revision": 0, - "Build": 6901 + "Build": 6902 }, "Company": "Flax", "Copyright": "Copyright (c) 2012-2026 Wojciech Figat. All rights reserved.", diff --git a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs index 954599347..9844f3fda 100644 --- a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs +++ b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs @@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated settingsButton.Tag = script; settingsButton.Clicked += OnSettingsButtonClicked; - group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2); + // Adjust the margin so the header text does not overlap other UI elements in the header + group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin }; group.Object(values, editor); // Remove drop down arrows and containment lines if no objects in the group if (group.Children.Count == 0) diff --git a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs index b3fff5644..28593a7f5 100644 --- a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs +++ b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs @@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors protected bool NotNullItems; private IntValueBox _sizeBox; + private Label _label; private Color _background; private int _elementsCount, _minCount, _maxCount; private bool _readOnly; @@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors Parent = dropPanel, }; - var label = new Label + _label = new Label { Text = "Size", AnchorPreset = AnchorPresets.TopRight, @@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors Resize(Count + 1); }; } - } + Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged; + } + private void OnSetupContextMenu(ContextMenu menu, DropPanel panel) { if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal))) @@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors }); } + private void OnLayoutSizeChanged(Control control) + { + if (Layout.ContainerControl is DropPanel dropPanel) + { + // Hide "Size" text when array editor title overlaps + var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText); + if (headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left) + _label.TextColor = _label.TextColorHighlighted = Color.Transparent; + else + _label.TextColor = _label.TextColorHighlighted = FlaxEngine.GUI.Style.Current.Foreground; + } + } + /// protected override void Deinitialize() { _sizeBox = null; + Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged; base.Deinitialize(); } diff --git a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs index 64bc9080b..055c6a29d 100644 --- a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs +++ b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs @@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements { var style = Style.Current; var settingsButtonSize = Panel.HeaderHeight; - return new Image + Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin }; + return
new Image { TooltipText = "Settings", AutoFocus = true, diff --git a/Source/Editor/GUI/Input/ValueBox.cs b/Source/Editor/GUI/Input/ValueBox.cs index 674ee0697..88ec9a4ee 100644 --- a/Source/Editor/GUI/Input/ValueBox.cs +++ b/Source/Editor/GUI/Input/ValueBox.cs @@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input /// public event Action SlidingEnd; + /// + /// If enabled, pressing the arrow up or down key increments/decrements the value. + /// + public bool ArrowKeysIncrement = true; + /// /// Gets or sets the slider speed. Use value 0 to disable and hide slider UI. /// @@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input ResetViewOffset(); } + /// + public override bool OnKeyDown(KeyboardKeys key) + { + if (ArrowKeysIncrement && (key == KeyboardKeys.ArrowUp || key == KeyboardKeys.ArrowDown)) + { + bool altDown = Root.GetKey(KeyboardKeys.Alt); + bool shiftDown = Root.GetKey(KeyboardKeys.Shift); + bool controlDown = Root.GetKey(KeyboardKeys.Control); + float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1f)); + float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue; + + _startSlideValue = Value; + ApplySliding(slideDelta); + EndSliding(); + Focus(); + return true; + } + + return base.OnKeyDown(key); + } + /// public override bool OnMouseDown(Float2 location, MouseButton button) { diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs index 66e83a6e9..7cd7e7fef 100644 --- a/Source/Editor/Modules/UIModule.cs +++ b/Source/Editor/Modules/UIModule.cs @@ -133,6 +133,7 @@ namespace FlaxEditor.Modules private ContextMenuButton _menuToolsProfilerWindow; private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault; private ContextMenuButton _menuToolsTakeScreenshot; + private ContextMenuButton _menuToolsOpenLocalFolder; private ContextMenuChildMenu _menuWindowApplyWindowLayout; private ToolStripButton _toolStripSaveAll; @@ -754,6 +755,16 @@ namespace FlaxEditor.Modules _menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot); cm.AddSeparator(); cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show()); + cm.AddSeparator(); + var childMenu = cm.AddChildMenu("Open Product Local folder"); + childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder)); + _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () => + { + string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData); + GameSettings settings = GameSettings.Load(); + string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName); + FileSystem.ShowFileExplorer(path); + }); // Window MenuWindow = MainMenu.AddButton("Window"); @@ -1091,6 +1102,10 @@ namespace FlaxEditor.Modules _menuToolsBuildNavMesh.Enabled = canEdit; _menuToolsCancelBuilding.Enabled = GameCooker.IsRunning; _menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0; + string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData); + GameSettings settings = GameSettings.Load(); + string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName); + _menuToolsOpenLocalFolder.Enabled = Directory.Exists(path); c.PerformLayout(); } diff --git a/Source/Editor/Options/InputOptions.cs b/Source/Editor/Options/InputOptions.cs index ab473ebed..a759b7247 100644 --- a/Source/Editor/Options/InputOptions.cs +++ b/Source/Editor/Options/InputOptions.cs @@ -571,6 +571,10 @@
namespace FlaxEditor.Options [EditorDisplay("View Flags"), EditorOrder(3260)] public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift); + [DefaultValue(typeof(InputBinding), "None")] + [EditorDisplay("View Flags"), EditorOrder(3270)] + public InputBinding Particles = new InputBinding(KeyboardKeys.None); + #endregion #region Interface diff --git a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs index c4fd47f71..4a7150972 100644 --- a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs +++ b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs @@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors if (value is BoxCollider collider) collider.AutoResize(!_keepLocalOrientation); } + Presenter.OnModified(); } } diff --git a/Source/Editor/Utilities/ShuntingYardParser.cs b/Source/Editor/Utilities/ShuntingYardParser.cs index 47e2275e5..fe473389c 100644 --- a/Source/Editor/Utilities/ShuntingYardParser.cs +++ b/Source/Editor/Utilities/ShuntingYardParser.cs @@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities /// The result value. public static double Parse(string text) { + // Hack to allow parsing numbers while using "_" as a separator (like this: 1_000) + text = text.Replace("_", string.Empty); + var tokens = Tokenize(text); var rpn = OrderTokens(tokens); return EvaluateRPN(rpn); diff --git a/Source/Editor/Viewport/EditorViewport.cs b/Source/Editor/Viewport/EditorViewport.cs index c16d3d9f5..2af065c68 100644 --- a/Source/Editor/Viewport/EditorViewport.cs +++ b/Source/Editor/Viewport/EditorViewport.cs @@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog); InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight); InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals); + InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles); InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess); InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom); InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping); @@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog), new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight), new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals), + new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles), new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess), new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom), new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping), @@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport if (cm.Visible == false) return; var ccm = (ContextMenu)cm; + var flags = Task.View.Flags; foreach (var e in ccm.Items) { if (e is ContextMenuButton b && b.Tag != null) { var v = (ViewFlags)b.Tag; - b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid; + b.Icon = (flags & v) != 0 ? 
Style.Current.CheckBoxTick : SpriteHandle.Invalid; } } } diff --git a/Source/Editor/Windows/EditorOptionsWindow.cs b/Source/Editor/Windows/EditorOptionsWindow.cs index 0ee9a92d7..c6bf2fd16 100644 --- a/Source/Editor/Windows/EditorOptionsWindow.cs +++ b/Source/Editor/Windows/EditorOptionsWindow.cs @@ -104,6 +104,8 @@ namespace FlaxEditor.Windows { _saveButton.Enabled = true; _isDataDirty = true; + if (!Title.EndsWith('*')) + Title += "*"; } } @@ -113,6 +115,8 @@ namespace FlaxEditor.Windows { _saveButton.Enabled = false; _isDataDirty = false; + if (Title.EndsWith('*')) + Title = Title.Remove(Title.Length - 1); } } diff --git a/Source/Engine/Content/Assets/Material.cpp b/Source/Engine/Content/Assets/Material.cpp index 019fd9dd8..b4cf55d4d 100644 --- a/Source/Engine/Content/Assets/Material.cpp +++ b/Source/Engine/Content/Assets/Material.cpp @@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const return false; } +#if USE_EDITOR + +void Material::GetReferences(Array<Guid>& assets, Array<String>& files) const +{ + ShaderAssetTypeBase::GetReferences(assets, files); + + // Collect references from material graph (needs to load it) + if (!WaitForLoaded() && HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE)) + { + ScopeLock lock(Locker); + if (!LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE))) + { + const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE); + if (surfaceChunk) + { + MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size()); + MaterialGraph graph; + if (!graph.Load(&stream, false)) + { + graph.GetReferences(assets); + } + } + } + } +} + +#endif + const MaterialInfo& Material::GetInfo() const { if (_materialShader) diff --git a/Source/Engine/Content/Assets/Material.h b/Source/Engine/Content/Assets/Material.h index 4ce47b154..cd2ae8e97 100644 --- a/Source/Engine/Content/Assets/Material.h +++ b/Source/Engine/Content/Assets/Material.h @@ -38,6 +38,9 @@ public: public: // [MaterialBase] bool IsMaterialInstance() const override; +#if USE_EDITOR + void GetReferences(Array<Guid>& assets, Array<String>& files) const override; +#endif // [IMaterial] const MaterialInfo& GetInfo() const override; diff --git a/Source/Engine/Debug/DebugDraw.cpp b/Source/Engine/Debug/DebugDraw.cpp index 7c798f88f..bea9e76f4 100644 --- a/Source/Engine/Debug/DebugDraw.cpp +++ b/Source/Engine/Debug/DebugDraw.cpp @@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool depthTest) return list->Get() + startIndex; } +FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest) +{ + Array<DebugTriangle>* list; + if (depthTest) + list = duration > 0 ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles; + else + list = duration > 0 ?
&Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles; + const int32 startIndex = list->Count(); + list->AddUninitialized(count); + return list->Get() + startIndex; +} + inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer) { Matrix w, fw, m; @@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const Color& col DebugTriangle t; t.Color = Color32(color); t.TimeLeft = duration; - auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest); + auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest); const Float3 origin = Context->Origin; for (int32 i = 0; i < vertices.Length();) { @@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const SpanOrigin; for (int32 i = 0; i < indices.Length();) { @@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const Color& co DebugTriangle t; t.Color = Color32(color); t.TimeLeft = duration; - auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest); + auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest); const Double3 origin = Context->Origin; for (int32 i = 0; i < vertices.Length();) { @@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const SpanOrigin; for (int32 i = 0; i < indices.Length();) { diff --git a/Source/Engine/Graphics/Enums.h b/Source/Engine/Graphics/Enums.h index f6af6c16b..107fe3533 100644 --- a/Source/Engine/Graphics/Enums.h +++ b/Source/Engine/Graphics/Enums.h @@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64 /// LightsDebug = 1 << 27, + /// + /// Shows/hides particle effects. + /// + Particles = 1 << 28, + /// /// Default flags for Game. /// - DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky, + DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles, /// /// Default flags for Editor. /// - DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky, + DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles, /// /// Default flags for materials/models previews generating. 
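/// Particles are now included in this preset as well; a preview that should stay static can opt out by clearing the bit.
/// Minimal C# sketch (the previewTask variable is hypothetical): previewTask.ViewFlags = ViewFlags.DefaultAssetPreview & ~ViewFlags.Particles;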
/// - DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky, + DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles, }; DECLARE_ENUM_OPERATORS(ViewFlags); diff --git a/Source/Engine/Input/Input.cpp b/Source/Engine/Input/Input.cpp index 8438977b1..7048140ef 100644 --- a/Source/Engine/Input/Input.cpp +++ b/Source/Engine/Input/Input.cpp @@ -80,6 +80,8 @@ Delegate Input::MouseDoubleClick; Delegate Input::MouseWheel; Delegate Input::MouseMove; Action Input::MouseLeave; +Delegate Input::GamepadButtonDown; +Delegate Input::GamepadButtonUp; Delegate Input::TouchDown; Delegate Input::TouchMove; Delegate Input::TouchUp; @@ -1027,6 +1029,19 @@ void InputService::Update() break; } } + // TODO: route gamepad button events into global InputEvents queue to improve processing + for (int32 i = 0; i < Input::Gamepads.Count(); i++) + { + auto gamepad = Input::Gamepads[i]; + for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++) + { + GamepadButton button = (GamepadButton)buttonIdx; + if (gamepad->GetButtonDown(button)) + Input::GamepadButtonDown((InputGamepadIndex)i, button); + else if (gamepad->GetButtonUp(button)) + Input::GamepadButtonUp((InputGamepadIndex)i, button); + } + } // Update all actions for (int32 i = 0; i < Input::ActionMappings.Count(); i++) diff --git a/Source/Engine/Input/Input.h b/Source/Engine/Input/Input.h index 8cc1b2106..73e87f5f0 100644 --- a/Source/Engine/Input/Input.h +++ b/Source/Engine/Input/Input.h @@ -113,6 +113,16 @@ public: /// API_EVENT() static Action MouseLeave; + /// + /// Event fired when gamepad button goes down. + /// + API_EVENT() static Delegate GamepadButtonDown; + + /// + /// Event fired when gamepad button goes up. + /// + API_EVENT() static Delegate GamepadButtonUp; + /// /// Event fired when touch action begins. 
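/// Handlers attach the same way as to the new gamepad events above; a minimal C# sketch (signature inferred from the dispatch in InputService::Update): Input.GamepadButtonDown += (gamepad, button) => Debug.Log($"{button} down on gamepad {gamepad}");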
/// diff --git a/Source/Engine/Particles/ParticleEffect.cpp b/Source/Engine/Particles/ParticleEffect.cpp index 6e94594b0..9592147a7 100644 --- a/Source/Engine/Particles/ParticleEffect.cpp +++ b/Source/Engine/Particles/ParticleEffect.cpp @@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const void ParticleEffect::Draw(RenderContext& renderContext) { - if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas) + if (renderContext.View.Pass == DrawPass::GlobalSDF || + renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas || + EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles)) return; _lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition)); RenderContextBatch renderContextBatch(renderContext); @@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext) void ParticleEffect::Draw(RenderContextBatch& renderContextBatch) { + const RenderView& mainView = renderContextBatch.GetMainContext().View; + if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles)) + return; Particles::DrawParticles(renderContextBatch, this); // Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections - const RenderView& mainView = renderContextBatch.GetMainContext().View; const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius); if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds)) return; diff --git a/Source/Engine/Physics/Colliders/BoxCollider.cpp b/Source/Engine/Physics/Colliders/BoxCollider.cpp index 1e90cb91f..47e551b37 100644 --- a/Source/Engine/Physics/Colliders/BoxCollider.cpp +++ b/Source/Engine/Physics/Colliders/BoxCollider.cpp @@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value) void BoxCollider::AutoResize(bool globalOrientation = true) { Actor* parent = GetParent(); - if (Cast(parent)) + if (parent == nullptr || Cast(parent)) return; // Get bounds of all siblings (excluding itself) const Vector3 parentScale = parent->GetScale(); if (parentScale.IsAnyZero()) - return; // Avoid division by zero + return; - // Hacky way to get unrotated bounded box of parent. + // Hacky way to get unrotated bounded box of parent const Quaternion parentOrientation = parent->GetOrientation(); parent->SetOrientation(Quaternion::Identity); BoundingBox parentBox = parent->GetBox(); diff --git a/Source/Engine/Scripting/Scripting.cs b/Source/Engine/Scripting/Scripting.cs index 7f9f2980c..229e411f3 100644 --- a/Source/Engine/Scripting/Scripting.cs +++ b/Source/Engine/Scripting/Scripting.cs @@ -137,8 +137,8 @@ namespace FlaxEngine { Debug.LogError($"Unhandled Exception: {exception.Message}"); Debug.LogException(exception); - if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached) - Platform.Fatal($"Unhandled Exception: {exception}"); + //if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached) + // Platform.Fatal($"Unhandled Exception: {exception}"); } } diff --git a/Source/Engine/UI/GUI/Panels/DropPanel.cs b/Source/Engine/UI/GUI/Panels/DropPanel.cs index de80f9fc5..308272218 100644 --- a/Source/Engine/UI/GUI/Panels/DropPanel.cs +++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs @@ -11,6 +11,11 @@ namespace FlaxEngine.GUI [ActorToolbox("GUI")] public class DropPanel : ContainerControl { + /// + /// Size of the drop down icon. + /// + public const float DropDownIconSize = 14.0f; + /// /// The header height. 
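/// When the drop-down icon is shown, the header text is offset by DropDownIconSize, so external layout code can reuse the same constant; the overlap test added to CollectionEditor.OnLayoutSizeChanged above is one example: headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left.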
/// @@ -368,7 +373,7 @@ namespace FlaxEngine.GUI var style = Style.Current; var enabled = EnabledInHierarchy; - // Paint Background + // Draw Background var backgroundColor = BackgroundColor; if (backgroundColor.A > 0.0f) { @@ -386,7 +391,7 @@ namespace FlaxEngine.GUI float textLeft = 0; if (EnableDropDownIcon) { - textLeft += 14; + textLeft += DropDownIconSize; var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12); var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey; if (_isClosed) @@ -395,7 +400,7 @@ namespace FlaxEngine.GUI ArrowImageOpened?.Draw(dropDownRect, arrowColor); } - // Text + // Header text var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight); _headerTextMargin.ShrinkRectangle(ref textRect); var textColor = HeaderTextColor; @@ -404,7 +409,9 @@ namespace FlaxEngine.GUI textColor *= 0.6f; } + Render2D.PushClip(textRect); Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center); + Render2D.PopClip(); if (!_isClosed && EnableContainmentLines) { diff --git a/Source/Shaders/GI/DDGI.hlsl b/Source/Shaders/GI/DDGI.hlsl index 3e31c2e53..b88b846a6 100644 --- a/Source/Shaders/GI/DDGI.hlsl +++ b/Source/Shaders/GI/DDGI.hlsl @@ -305,6 +305,8 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, T uint cascadeIndex = DDGI_DEBUG_CASCADE; #else uint cascadeIndex = 0; + if (data.CascadesCount == 0) + return float3(0, 0, 0); for (; cascadeIndex < data.CascadesCount; cascadeIndex++) { // Get cascade data diff --git a/Source/ThirdParty/meshoptimizer/allocator.cpp b/Source/ThirdParty/meshoptimizer/allocator.cpp index 12eda3872..6b6083da2 100644 --- a/Source/ThirdParty/meshoptimizer/allocator.cpp +++ b/Source/ThirdParty/meshoptimizer/allocator.cpp @@ -1,8 +1,17 @@ // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details #include "meshoptimizer.h" -void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*)) +#ifdef MESHOPTIMIZER_ALLOC_EXPORT +meshopt_Allocator::Storage& meshopt_Allocator::storage() { - meshopt_Allocator::Storage::allocate = allocate; - meshopt_Allocator::Storage::deallocate = deallocate; + static Storage s = {::operator new, ::operator delete }; + return s; +} +#endif + +void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*)) +{ + meshopt_Allocator::Storage& s = meshopt_Allocator::storage(); + s.allocate = allocate; + s.deallocate = deallocate; } diff --git a/Source/ThirdParty/meshoptimizer/clusterizer.cpp b/Source/ThirdParty/meshoptimizer/clusterizer.cpp index 52fe5a362..73cc0ab53 100644 --- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp +++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp @@ -6,19 +6,39 @@ #include #include +// The block below auto-detects SIMD ISA that can be used on the target platform +#ifndef MESHOPTIMIZER_NO_SIMD +#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64)) +#define SIMD_SSE +#include +#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922) +#define SIMD_NEON +#include +#endif +#endif // !MESHOPTIMIZER_NO_SIMD + // This work is based on: // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 // Jack Ritter. An Efficient Bounding Sphere. 1990 +// Thomas Larsson. 
Fast and Tight Fitting Bounding Spheres. 2008 +// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006 namespace meshopt { -// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet -const size_t kMeshletMaxVertices = 255; +// This must be <= 256 since meshlet indices are stored as bytes +const size_t kMeshletMaxVertices = 256; // A reasonable limit is around 2*max_vertices or less const size_t kMeshletMaxTriangles = 512; +// We keep a limited number of seed triangles and add a few triangles per finished meshlet +const size_t kMeshletMaxSeeds = 256; +const size_t kMeshletAddSeeds = 4; + +// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree +const int kMeshletMaxTreeDepth = 50; + struct TriangleAdjacency2 { unsigned int* counts; @@ -70,72 +90,190 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned for (size_t i = 0; i < vertex_count; ++i) { assert(adjacency.offsets[i] >= adjacency.counts[i]); - adjacency.offsets[i] -= adjacency.counts[i]; } } -static void computeBoundingSphere(float result[4], const float points[][3], size_t count) +static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) { - assert(count > 0); + size_t face_count = index_count / 3; - // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates - size_t pmin[3] = {0, 0, 0}; - size_t pmax[3] = {0, 0, 0}; + // sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices + const unsigned int sparse_seen = 1u << 31; + assert(index_count < sparse_seen); + + // allocate arrays + adjacency.counts = allocator.allocate(vertex_count); + adjacency.offsets = allocator.allocate(vertex_count); + adjacency.data = allocator.allocate(index_count); + + // fill triangle counts + for (size_t i = 0; i < index_count; ++i) + assert(indices[i] < vertex_count); + + for (size_t i = 0; i < index_count; ++i) + adjacency.counts[indices[i]] = 0; + + for (size_t i = 0; i < index_count; ++i) + adjacency.counts[indices[i]]++; + + // fill offset table; uses sparse_seen bit to tag visited vertices + unsigned int offset = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int v = indices[i]; + + if ((adjacency.counts[v] & sparse_seen) == 0) + { + adjacency.offsets[v] = offset; + offset += adjacency.counts[v]; + adjacency.counts[v] |= sparse_seen; + } + } + + assert(offset == index_count); + + // fill triangle data + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + + adjacency.data[adjacency.offsets[a]++] = unsigned(i); + adjacency.data[adjacency.offsets[b]++] = unsigned(i); + adjacency.data[adjacency.offsets[c]++] = unsigned(i); + } + + // fix offsets that have been disturbed by the previous pass + // also fix counts (that were marked with sparse_seen by the first pass) + for (size_t i = 0; i < index_count; ++i) + { + unsigned int v = indices[i]; + + if (adjacency.counts[v] & sparse_seen) + { + adjacency.counts[v] &= ~sparse_seen; + + assert(adjacency.offsets[v] >= adjacency.counts[v]); + adjacency.offsets[v] -= adjacency.counts[v]; + } + } +} + +static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count) +{ + // for sparse inputs, it's 
faster to only clear vertices referenced by the index buffer + if (vertex_count <= index_count) + memset(used, -1, vertex_count * sizeof(short)); + else + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + used[indices[i]] = -1; + } +} + +static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) +{ + static const float kAxes[7][3] = { + // X, Y, Z + {1, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + + // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length + {0.57735026f, 0.57735026f, 0.57735026f}, + {-0.57735026f, 0.57735026f, 0.57735026f}, + {0.57735026f, -0.57735026f, 0.57735026f}, + {0.57735026f, 0.57735026f, -0.57735026f}, + }; + + assert(count > 0); + assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0])); + + size_t points_stride_float = points_stride / sizeof(float); + size_t radii_stride_float = radii_stride / sizeof(float); + + // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates + size_t pmin[7], pmax[7]; + float tmin[7], tmax[7]; + + for (size_t axis = 0; axis < axis_count; ++axis) + { + pmin[axis] = pmax[axis] = 0; + tmin[axis] = FLT_MAX; + tmax[axis] = -FLT_MAX; + } for (size_t i = 0; i < count; ++i) { - const float* p = points[i]; + const float* p = points + i * points_stride_float; + float r = radii[i * radii_stride_float]; - for (int axis = 0; axis < 3; ++axis) + for (size_t axis = 0; axis < axis_count; ++axis) { - pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis]; - pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis]; + const float* ax = kAxes[axis]; + + float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2]; + float tpmin = tp - r, tpmax = tp + r; + + pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis]; + pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis]; + tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis]; + tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis]; } } // find the pair of points with largest distance - float paxisd2 = 0; - int paxis = 0; + size_t paxis = 0; + float paxisdr = 0; - for (int axis = 0; axis < 3; ++axis) + for (size_t axis = 0; axis < axis_count; ++axis) { - const float* p1 = points[pmin[axis]]; - const float* p2 = points[pmax[axis]]; + const float* p1 = points + pmin[axis] * points_stride_float; + const float* p2 = points + pmax[axis] * points_stride_float; + float r1 = radii[pmin[axis] * radii_stride_float]; + float r2 = radii[pmax[axis] * radii_stride_float]; float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]); + float dr = sqrtf(d2) + r1 + r2; - if (d2 > paxisd2) + if (dr > paxisdr) { - paxisd2 = d2; + paxisdr = dr; paxis = axis; } } // use the longest segment as the initial sphere diameter - const float* p1 = points[pmin[paxis]]; - const float* p2 = points[pmax[paxis]]; + const float* p1 = points + pmin[paxis] * points_stride_float; + const float* p2 = points + pmax[paxis] * points_stride_float; + float r1 = radii[pmin[paxis] * radii_stride_float]; + float r2 = radii[pmax[paxis] * radii_stride_float]; - float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2}; - float radius = sqrtf(paxisd2) / 2; + float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2])); + float paxisk = paxisd > 0 ? 
(paxisd + r2 - r1) / (2 * paxisd) : 0.f; + + float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk}; + float radius = paxisdr / 2; // iteratively adjust the sphere up until all points fit for (size_t i = 0; i < count; ++i) { - const float* p = points[i]; + const float* p = points + i * points_stride_float; + float r = radii[i * radii_stride_float]; + float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); + float d = sqrtf(d2); - if (d2 > radius * radius) + if (d + r > radius) { - float d = sqrtf(d2); - assert(d > 0); + float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f; - float k = 0.5f + (radius / d) / 2; - - center[0] = center[0] * k + p[0] * (1 - k); - center[1] = center[1] * k + p[1] * (1 - k); - center[2] = center[2] * k + p[2] * (1 - k); - radius = (radius + d) / 2; + center[0] += k * (p[0] - center[0]); + center[1] += k * (p[1] - center[1]); + center[2] += k * (p[2] - center[2]); + radius = (radius + d + r) / 2; } } @@ -151,12 +289,12 @@ struct Cone float nx, ny, nz; }; -static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius) +static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius) { float cone = 1.f - spread * cone_weight; float cone_clamped = cone < 1e-3f ? 1e-3f : cone; - return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped; + return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped; } static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count) @@ -221,72 +359,61 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices, return mesh_area; } -static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles) +static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false) { - size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; - - // fill 4b padding with 0 - while (offset & 3) - meshlet_triangles[offset++] = 0; -} - -static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles) -{ - unsigned char& av = used[a]; - unsigned char& bv = used[b]; - unsigned char& cv = used[c]; + short& av = used[a]; + short& bv = used[b]; + short& cv = used[c]; bool result = false; - unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + int used_extra = (av < 0) + (bv < 0) + (cv < 0); - if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) + if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split) { meshlets[meshlet_offset] = meshlet; for (size_t j = 0; j < meshlet.vertex_count; ++j) - used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff; - - finishMeshlet(meshlet, meshlet_triangles); + used[meshlet_vertices[meshlet.vertex_offset + j]] = -1; meshlet.vertex_offset += meshlet.vertex_count; - meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding + meshlet.triangle_offset += 
meshlet.triangle_count * 3; meshlet.vertex_count = 0; meshlet.triangle_count = 0; result = true; } - if (av == 0xff) + if (av < 0) { - av = (unsigned char)meshlet.vertex_count; + av = short(meshlet.vertex_count); meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a; } - if (bv == 0xff) + if (bv < 0) { - bv = (unsigned char)meshlet.vertex_count; + bv = short(meshlet.vertex_count); meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b; } - if (cv == 0xff) + if (cv < 0) { - cv = (unsigned char)meshlet.vertex_count; + cv = short(meshlet.vertex_count); meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c; } - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av; - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv; - meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv; meshlet.triangle_count++; return result; } -static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight) { unsigned int best_triangle = ~0u; - unsigned int best_extra = 5; + int best_priority = 5; float best_score = FLT_MAX; for (size_t i = 0; i < meshlet.vertex_count; ++i) @@ -301,61 +428,159 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co unsigned int triangle = neighbors[j]; unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0); + assert(extra <= 2); + + int priority = -1; // triangles that don't add new vertices to meshlets are max. 
priority - if (extra != 0) - { - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - extra = 0; - - extra++; - } + if (extra == 0) + priority = 0; + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + priority = 1; + // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow + else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2) + priority = 1 + extra; + // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count + else + priority = 2 + extra; // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (extra > best_extra) + if (priority > best_priority) continue; - float score = 0; + const Cone& tri_cone = triangles[triangle]; - // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles) - if (meshlet_cone) - { - const Cone& tri_cone = triangles[triangle]; + float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz; + float distance = sqrtf(dx * dx + dy * dy + dz * dz); + float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; - float distance2 = - (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) + - (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) + - (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz); - - float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz; - - score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); - } - else - { - // each live_triangles entry is >= 1 since it includes the current triangle we're processing - score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3); - } + float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius); // note that topology-based priority is always more important than the score // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (extra < best_extra || score < best_score) + if (priority < best_priority || score < best_score) { best_triangle = triangle; - best_extra = extra; + best_priority = priority; best_score = score; } } } - if (out_extra) - *out_extra = best_extra; - return best_triangle; } +static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz) +{ + unsigned int best_seeds[kMeshletAddSeeds]; + unsigned int best_live[kMeshletAddSeeds]; + float best_score[kMeshletAddSeeds]; + + for (size_t i = 0; i < kMeshletAddSeeds; ++i) + { + best_seeds[i] = ~0u; + best_live[i] = ~0u; + best_score[i] = FLT_MAX; + } + + for (size_t i = 0; i < meshlet.vertex_count; ++i) + { + unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; + + unsigned int best_neighbor = ~0u; + unsigned int best_neighbor_live = ~0u; + + // find the neighbor with the 
smallest live metric + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; + + for (size_t j = 0; j < neighbors_size; ++j) + { + unsigned int triangle = neighbors[j]; + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + + unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; + + if (live < best_neighbor_live) + { + best_neighbor = triangle; + best_neighbor_live = live; + } + } + + // add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate + if (best_neighbor == ~0u) + continue; + + float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz; + float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz); + + for (size_t j = 0; j < kMeshletAddSeeds; ++j) + { + // non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices) + if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j])) + { + best_seeds[j] = best_neighbor; + best_live[j] = best_neighbor_live; + best_score[j] = best_neighbor_score; + break; + } + } + } + + // add surviving seeds to the meshlet + size_t seed_count = 0; + + for (size_t i = 0; i < kMeshletAddSeeds; ++i) + if (best_seeds[i] != ~0u) + seeds[seed_count++] = best_seeds[i]; + + return seed_count; +} + +static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags) +{ + size_t result = 0; + + for (size_t i = 0; i < seed_count; ++i) + { + unsigned int index = seeds[i]; + + seeds[result] = index; + result += emitted_flags[index] == 0; + } + + return result; +} + +static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz) +{ + unsigned int best_seed = ~0u; + unsigned int best_live = ~0u; + float best_score = FLT_MAX; + + for (size_t i = 0; i < seed_count; ++i) + { + unsigned int index = seeds[i]; + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; + float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); + + if (live < best_live || (live == best_live && score < best_score)) + { + best_seed = index; + best_live = live; + best_score = score; + } + } + + return best_seed; +} + struct KDNode { union @@ -364,13 +589,13 @@ struct KDNode unsigned int index; }; - // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point) + // leaves: axis = 3, children = number of points including this one // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children unsigned int axis : 2; unsigned int children : 30; }; -static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot) { size_t m = 0; @@ -400,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u result.index = indices[0]; 
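// Note: for leaves, 'children' now stores the number of points including this one (previously count - 1); together with the kdtreeNearest changes below, a node with children == 0 doubles as an "exhausted" marker so fully-emitted leaves and branches are skipped during traversal.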
result.axis = 3; - result.children = unsigned(count - 1); + result.children = unsigned(count); // all remaining points are stored in nodes immediately following the leaf for (size_t i = 1; i < count; ++i) @@ -415,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u return offset + count; } -static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size) +static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth) { assert(count > 0); assert(offset < node_count); @@ -441,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const } // split axis is one where the variance is largest - unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); float split = mean[axis]; size_t middle = kdtreePartition(indices, count, points, stride, axis, split); // when the partition is degenerate simply consolidate the points into a single node - if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2) + // this also ensures recursion depth is bounded on pathological inputs + if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth) return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); KDNode& result = nodes[offset]; @@ -456,35 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const result.axis = axis; // left subtree is right after our node - size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size); + size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1); // distance to the right subtree is represented explicitly + assert(next_offset - offset > 1); result.children = unsigned(next_offset - offset - 1); - return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); + return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1); } static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) { const KDNode& node = nodes[root]; + if (node.children == 0) + return; + if (node.axis == 3) { // leaf - for (unsigned int i = 0; i <= node.children; ++i) + bool inactive = true; + + for (unsigned int i = 0; i < node.children; ++i) { unsigned int index = nodes[root + i].index; if (emitted_flags[index]) continue; + inactive = false; + const float* point = points + index * stride; - float distance2 = - (point[0] - position[0]) * (point[0] - position[0]) + - (point[1] - position[1]) * (point[1] - position[1]) + - (point[2] - position[2]) * (point[2] - position[2]); - float distance = sqrtf(distance2); + float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2]; + float distance = sqrtf(dx * dx + dy * dy + dz * dz); if (distance < limit) { @@ -492,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, limit = distance; } } + + // deactivate leaves that no longer have items to emit + if 
(inactive) + nodes[root].children = 0; } else { @@ -500,6 +735,12 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, unsigned int first = (delta <= 0) ? 0 : node.children; unsigned int second = first ^ node.children; + // deactivate branches that no longer have items to emit to accelerate traversal + // note that we do this *before* recursing which delays deactivation but keeps tail calls + if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0) + nodes[root].children = 0; + + // recursion depth is bounded by tree depth (which is limited by construction) kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); // only process the other node if it can have a match based on closest distance so far @@ -508,6 +749,380 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, } } +struct BVHBoxT +{ + float min[4]; + float max[4]; +}; + +struct BVHBox +{ + float min[3]; + float max[3]; +}; + +#if defined(SIMD_SSE) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + __m128 min = _mm_loadu_ps(box.min); + __m128 max = _mm_loadu_ps(box.max); + + // note: over-read is safe because BVHBox array is allocated with padding + min = _mm_min_ps(min, _mm_loadu_ps(other.min)); + max = _mm_max_ps(max, _mm_loadu_ps(other.max)); + + _mm_storeu_ps(box.min, min); + _mm_storeu_ps(box.max, max); + + __m128 size = _mm_sub_ps(max, min); + __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); + __m128 mul = _mm_mul_ps(size, size_yzx); + __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1))); + __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2))); + + return _mm_cvtss_f32(sum_xyz); +} +#elif defined(SIMD_NEON) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + float32x4_t min = vld1q_f32(box.min); + float32x4_t max = vld1q_f32(box.max); + + // note: over-read is safe because BVHBox array is allocated with padding + min = vminq_f32(min, vld1q_f32(other.min)); + max = vmaxq_f32(max, vld1q_f32(other.max)); + + vst1q_f32(box.min, min); + vst1q_f32(box.max, max); + + float32x4_t size = vsubq_f32(max, min); + float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2); + float32x4_t mul = vmulq_f32(size, size_yzx); + float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1); + float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2); + + return sum_xyz; +} +#else +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + for (int k = 0; k < 3; ++k) + { + box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k]; + box.max[k] = other.max[k] > box.max[k] ? 
other.max[k] : box.max[k]; + } + + float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2]; + return sx * sy + sx * sz + sy * sz; +} +#endif + +inline unsigned int radixFloat(unsigned int v) +{ + // if sign bit is 0, flip sign bit + // if sign bit is 1, flip everything + unsigned int mask = (int(v) >> 31) | 0x80000000; + return v ^ mask; +} + +static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count) +{ + memset(hist, 0, sizeof(hist)); + + const unsigned int* bits = reinterpret_cast(data); + + // compute 3 10-bit histograms in parallel (dropping 2 LSB) + for (size_t i = 0; i < count; ++i) + { + unsigned int id = radixFloat(bits[i]); + + hist[(id >> 2) & 1023][0]++; + hist[(id >> 12) & 1023][1]++; + hist[(id >> 22) & 1023][2]++; + } + + unsigned int sum0 = 0, sum1 = 0, sum2 = 0; + + // replace histogram data with prefix histogram sums in-place + for (int i = 0; i < 1024; ++i) + { + unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; + + hist[i][0] = sum0; + hist[i][1] = sum1; + hist[i][2] = sum2; + + sum0 += hx; + sum1 += hy; + sum2 += hz; + } + + assert(sum0 == count && sum1 == count && sum2 == count); +} + +static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass) +{ + const unsigned int* bits = reinterpret_cast(keys); + int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes + + for (size_t i = 0; i < count; ++i) + { + unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023; + + destination[hist[id][pass]++] = source[i]; + } +} + +static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float) +{ + (void)vertex_count; + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* va = vertex_positions + vertex_stride_float * a; + const float* vb = vertex_positions + vertex_stride_float * b; + const float* vc = vertex_positions + vertex_stride_float * c; + + BVHBox& box = boxes[i]; + + for (int k = 0; k < 3; ++k) + { + box.min[k] = va[k] < vb[k] ? va[k] : vb[k]; + box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k]; + + box.max[k] = va[k] > vb[k] ? va[k] : vb[k]; + box.max[k] = vc[k] > box.max[k] ? 
vc[k] : box.max[k]; + + centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f; + } + } +} + +static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL) +{ + // count number of unique vertices + size_t used_vertices = 0; + for (size_t i = 0; i < count; ++i) + { + unsigned int index = order[i]; + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0); + used[a] = used[b] = used[c] = 1; + + if (out) + out[i] = unsigned(used_vertices); + } + + // reset used[] for future invocations + for (size_t i = 0; i < count; ++i) + { + unsigned int index = order[i]; + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + used[a] = used[b] = used[c] = -1; + } + + return used_vertices; +} + +static void bvhPackLeaf(unsigned char* boundary, size_t count) +{ + // mark meshlet boundary for future reassembly + assert(count > 0); + + boundary[0] = 1; + memset(boundary + 1, 0, count - 1); +} + +static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles) +{ + for (size_t i = 0; i < count;) + { + size_t chunk = i + max_triangles <= count ? max_triangles : count - i; + + if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices) + { + bvhPackLeaf(boundary + i, chunk); + i += chunk; + continue; + } + + // chunk is vertex bound, split it into smaller meshlets + assert(chunk > max_vertices / 3); + + bvhPackLeaf(boundary + i, max_vertices / 3); + i += max_vertices / 3; + } +} + +static bool bvhDivisible(size_t count, size_t min, size_t max) +{ + // count is representable as a sum of values in [min..max] if and only if it is in range of [k*min..k*min+k*(max-min)] + // equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder) + // we avoid expensive integer divisions in the common case where min is <= max/2 + return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min); +} + +static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count) +{ + BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}}; + BVHBoxT accumr = accuml; + + for (size_t i = 0; i < count; ++i) + { + float larea = boxMerge(accuml, boxes[order[i]]); + float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]); + + areas[i] = larea; + areas[i + count] = rarea; + } +} + +static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost) +{ + bool aligned = count >= min * 2 && bvhDivisible(count, min, max); + size_t end = aligned ?
count - min : count - 1; + + float rmaxfill = 1.f / float(int(maxfill)); + + // find best split that minimizes SAH + size_t bestsplit = 0; + float bestcost = FLT_MAX; + + for (size_t i = min - 1; i < end; i += step) + { + size_t lsplit = i + 1, rsplit = count - (i + 1); + + if (!bvhDivisible(lsplit, min, max)) + continue; + if (aligned && !bvhDivisible(rsplit, min, max)) + continue; + + // areas[x] = inclusive surface area of boxes[0..x] + // areas[count-1-x] = inclusive surface area of boxes[x..count-1] + float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count]; + float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit)); + + if (cost > bestcost) + continue; + + // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count + // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice + size_t lfill = vertices ? vertices[i] : lsplit; + size_t rfill = vertices ? vertices[i] : rsplit; + + // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo + int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill); + int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill); + + cost += fill * (float(lrest) * larea + float(rrest) * rarea); + + if (cost < bestcost) + { + bestcost = cost; + bestsplit = i + 1; + } + } + + *out_cost = bestcost; + return bestsplit; +} + +static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) +{ + size_t l = 0, r = split; + + for (size_t i = 0; i < count; ++i) + { + unsigned char side = sides[order[i]]; + target[side ? r : l] = order[i]; + l += 1; + l -= side; + r += side; + } + + assert(l == split && r == count); +} + +static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices) + return bvhPackLeaf(boundary, count); + + unsigned int* axes[3] = {orderx, ordery, orderz}; + + // we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max + size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1; + + // if we could not pack the meshlet, we must be vertex bound + size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles; + size_t maxfill = count <= max_triangles ? 
max_vertices : max_triangles; + + // find best split that minimizes SAH + int bestk = -1; + size_t bestsplit = 0; + float bestcost = FLT_MAX; + + for (int k = 0; k < 3; ++k) + { + float* areas = static_cast<float*>(scratch); + unsigned int* vertices = NULL; + + bvhComputeArea(areas, boxes, axes[k], count); + + if (count <= max_triangles) + { + // for vertex bound clusters, count number of unique vertices for each split + vertices = reinterpret_cast<unsigned int*>(areas + 2 * count); + bvhCountVertices(axes[k], count, used, indices, vertices); + } + + float axiscost = FLT_MAX; + size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost); + + if (axissplit && axiscost < bestcost) + { + bestk = k; + bestcost = axiscost; + bestsplit = axissplit; + } + } + + // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs + if (bestk < 0 || depth >= kMeshletMaxTreeDepth) + return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); + + // mark sides of split for partitioning + unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int); + + for (size_t i = 0; i < bestsplit; ++i) + sides[axes[bestk][i]] = 0; + + for (size_t i = bestsplit; i < count; ++i) + sides[axes[bestk][i]] = 1; + + // partition all axes into two sides, maintaining order + unsigned int* temp = static_cast<unsigned int*>(scratch); + + for (int k = 0; k < 3; ++k) + { + if (k == bestk) + continue; + + unsigned int* axis = axes[k]; + memcpy(temp, axis, sizeof(unsigned int) * count); + bvhPartition(axis, temp, sides, bestsplit, count); + } + + // recursion depth is bounded due to max depth check above + bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); + bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); +} + } // namespace meshopt size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) @@ -517,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ assert(index_count % 3 == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned (void)kMeshletMaxVertices; (void)kMeshletMaxTriangles; @@ -532,7 +1146,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ return meshlet_limit_vertices > meshlet_limit_triangles ?
meshlet_limit_vertices : meshlet_limit_triangles; } -size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) +size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor) { using namespace meshopt; @@ -541,18 +1155,24 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve assert(vertex_positions_stride % sizeof(float) == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); - assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); assert(cone_weight >= 0 && cone_weight <= 1); + assert(split_factor >= 0); + + if (index_count == 0) + return 0; meshopt_Allocator allocator; TriangleAdjacency2 adjacency = {}; - buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); + if (vertex_count > index_count && index_count < (1u << 31)) + buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator); + else + buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); - unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count); - memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); + // live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match + unsigned int* live_triangles = adjacency.counts; size_t face_count = index_count / 3; @@ -573,11 +1193,45 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve kdindices[i] = unsigned(i); KDNode* nodes = allocator.allocate<KDNode>(face_count * 2); - kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8); + kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0); - // index of the vertex in the meshlet, 0xff if the vertex isn't used - unsigned char* used = allocator.allocate<unsigned char>(vertex_count); - memset(used, -1, vertex_count); + // find a specific corner of the mesh to use as a starting point for meshlet flow + float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX; + + for (size_t i = 0; i < face_count; ++i) + { + const Cone& tri = triangles[i]; + + cornerx = cornerx > tri.px ? tri.px : cornerx; + cornery = cornery > tri.py ? tri.py : cornery; + cornerz = cornerz > tri.pz ?
tri.pz : cornerz; + } + + // index of the vertex in the meshlet, -1 if the vertex isn't used + short* used = allocator.allocate<short>(vertex_count); + clearUsed(used, vertex_count, indices, index_count); + + // initial seed triangle is the one closest to the corner + unsigned int initial_seed = ~0u; + float initial_score = FLT_MAX; + + for (size_t i = 0; i < face_count; ++i) + { + const Cone& tri = triangles[i]; + + float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); + + if (initial_seed == ~0u || score < initial_score) + { + initial_seed = unsigned(i); + initial_score = score; + } + } + + // seed triangles to continue meshlet flow + unsigned int seeds[kMeshletMaxSeeds] = {}; + size_t seed_count = 0; meshopt_Meshlet meshlet = {}; size_t meshlet_offset = 0; @@ -588,46 +1242,61 @@ { Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - unsigned int best_extra = 0; - unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); + unsigned int best_triangle = ~0u; - // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring - if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) - { - best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL); - } + // for the first triangle, we don't have a meshlet cone yet, so we use the initial seed + // to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring + if (meshlet_offset == 0 && meshlet.triangle_count == 0) + best_triangle = initial_seed; + else + best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight); - // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity + bool split = false; + + // when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity if (best_triangle == ~0u) { float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; unsigned int index = ~0u; - float limit = FLT_MAX; + float distance = FLT_MAX; - kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit); + kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance); best_triangle = index; + split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor; } if (best_triangle == ~0u) break; + int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0); + + // if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow + if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) + { + seed_count = pruneSeedTriangles(seeds, seed_count,
emitted_flags); + seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds; + seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz); + + unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz); + + // we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency + best_triangle = best_seed != ~0u ? best_seed : best_triangle; + } + unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2]; assert(a < vertex_count && b < vertex_count && c < vertex_count); // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds - if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles)) + if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split)) { meshlet_offset++; memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc)); } - live_triangles[a]--; - live_triangles[b]--; - live_triangles[c]--; - // remove emitted triangle from adjacency data // this makes sure that we spend less time traversing these lists on subsequent iterations + // live triangle counts are updated as a byproduct of these adjustments for (size_t k = 0; k < 3; ++k) { unsigned int index = indices[best_triangle * 3 + k]; @@ -656,20 +1325,23 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve meshlet_cone_acc.ny += triangles[best_triangle].ny; meshlet_cone_acc.nz += triangles[best_triangle].nz; + assert(!emitted_flags[best_triangle]); emitted_flags[best_triangle] = 1; } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } + meshlets[meshlet_offset++] = meshlet; - assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } +size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) +{ + return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f); +} + size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) { using namespace meshopt; @@ -678,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
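+	// note: with the 4b alignment requirement dropped, output buffers are sized purely from the bound; an illustrative caller-side sketch:
+	//   size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);
+	//   std::vector<meshopt_Meshlet> meshlets(max_meshlets);
+	//   std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
+	//   std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);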
meshopt_Allocator allocator; - // index of the vertex in the meshlet, 0xff if the vertex isn't used - unsigned char* used = allocator.allocate<unsigned char>(vertex_count); - memset(used, -1, vertex_count); + // index of the vertex in the meshlet, -1 if the vertex isn't used + short* used = allocator.allocate<short>(vertex_count); + clearUsed(used, vertex_count, indices, index_count); meshopt_Meshlet meshlet = {}; size_t meshlet_offset = 0; @@ -699,13 +1370,109 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } + meshlets[meshlet_offset++] = meshlet; assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); + return meshlet_offset; +} + +size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); + + if (index_count == 0) + return 0; + + size_t face_count = index_count / 3; + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + meshopt_Allocator allocator; + + // 3 floats plus 1 uint for sorting, or + // 2 floats plus 1 uint for pivoting, or + // 1 uint plus 1 byte for partitioning + float* scratch = allocator.allocate<float>(face_count * 4); + + // compute bounding boxes and centroids for sorting + BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD + bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float); + memset(boxes + face_count, 0, sizeof(BVHBox)); + + unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3); + unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3; + + for (int k = 0; k < 3; ++k) + { + unsigned int* order = axes + k * face_count; + const float* keys = scratch + k * face_count; + + unsigned int hist[1024][3]; + computeHistogram(hist, keys, face_count); + + // 3-pass radix sort computes the resulting order into axes + for (size_t i = 0; i < face_count; ++i) + temp[i] = unsigned(i); + + radixPass(order, temp, keys, face_count, hist, 0); + radixPass(temp, order, keys, face_count, hist, 1); + radixPass(order, temp, keys, face_count, hist, 2); + } + + // index of the vertex in the meshlet, -1 if the vertex isn't used + short* used = allocator.allocate<short>(vertex_count); + clearUsed(used, vertex_count, indices, index_count); + + unsigned char* boundary = allocator.allocate<unsigned char>(face_count); + + bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); + + // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound + size_t meshlet_count = 0; + for (size_t i = 0; i < face_count; ++i) + { +
assert(boundary[i] <= 1); + meshlet_count += boundary[i]; + } + + size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles); + + // pack triangles into meshlets according to the order and boundaries marked by bvhSplit + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; + size_t meshlet_pending = meshlet_count; + + for (size_t i = 0; i < face_count; ++i) + { + assert(boundary[i] <= 1); + bool split = i > 0 && boundary[i] == 1; + + // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space + if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound) + split = false; + + unsigned int index = axes[i]; + assert(index < face_count); + + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + // appends triangle to the meshlet and writes previous meshlet to the output if full + meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split); + meshlet_pending -= boundary[i]; + } + + if (meshlet.triangle_count) + meshlets[meshlet_offset++] = meshlet; + + assert(meshlet_offset <= meshlet_bound); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } @@ -765,15 +1532,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t if (triangles == 0) return bounds; + const float rzero = 0.f; + // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well float psphere[4] = {}; - computeBoundingSphere(psphere, corners[0], triangles * 3); + computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7); float center[3] = {psphere[0], psphere[1], psphere[2]}; // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis float nsphere[4] = {}; - computeBoundingSphere(nsphere, normals, triangles); + computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3); float axis[3] = {nsphere[0], nsphere[1], nsphere[2]}; float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]); @@ -883,6 +1652,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); } +meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride) +{ + using namespace meshopt; + + assert(positions_stride >= 12 && positions_stride <= 256); + assert(positions_stride % sizeof(float) == 0); + assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL); + assert(radii_stride % sizeof(float) == 0); + + meshopt_Bounds bounds = {}; + + if (count == 0) + return bounds; + + const float rzero = 0.f; + + float psphere[4] = {}; + computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? 
radii_stride : 0, 7); + + bounds.center[0] = psphere[0]; + bounds.center[1] = psphere[1]; + bounds.center[2] = psphere[2]; + bounds.radius = psphere[3]; + + return bounds; +} + void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count) { using namespace meshopt; @@ -950,25 +1746,28 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh // reorder meshlet vertices for access locality assuming index buffer is scanned sequentially unsigned int order[kMeshletMaxVertices]; - unsigned char remap[kMeshletMaxVertices]; - memset(remap, -1, vertex_count); + short remap[kMeshletMaxVertices]; + memset(remap, -1, vertex_count * sizeof(short)); size_t vertex_offset = 0; for (size_t i = 0; i < triangle_count * 3; ++i) { - unsigned char& r = remap[indices[i]]; + short& r = remap[indices[i]]; - if (r == 0xff) + if (r < 0) { - r = (unsigned char)(vertex_offset); + r = short(vertex_offset); order[vertex_offset] = vertices[indices[i]]; vertex_offset++; } - indices[i] = r; + indices[i] = (unsigned char)r; } assert(vertex_offset <= vertex_count); memcpy(vertices, order, vertex_offset * sizeof(unsigned int)); } + +#undef SIMD_SSE +#undef SIMD_NEON diff --git a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp similarity index 58% rename from Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp rename to Source/ThirdParty/meshoptimizer/indexanalyzer.cpp index 368274382..87ceeae66 100644 --- a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp +++ b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp @@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind return result; } + +meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + assert(index_count % 3 == 0); + assert(vertex_size > 0 && vertex_size <= 256); + + meshopt_Allocator allocator; + + meshopt_VertexFetchStatistics result = {}; + + unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count); + memset(vertex_visited, 0, vertex_count); + + const size_t kCacheLine = 64; + const size_t kCacheSize = 128 * 1024; + + // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway + size_t cache[kCacheSize / kCacheLine] = {}; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + vertex_visited[index] = 1; + + size_t start_address = index * vertex_size; + size_t end_address = start_address + vertex_size; + + size_t start_tag = start_address / kCacheLine; + size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; + + assert(start_tag < end_tag); + + for (size_t tag = start_tag; tag < end_tag; ++tag) + { + size_t line = tag % (sizeof(cache) / sizeof(cache[0])); + + // we store +1 since cache is filled with 0 by default + result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; + cache[line] = tag + 1; + } + } + + size_t unique_vertex_count = 0; + + for (size_t i = 0; i < vertex_count; ++i) + unique_vertex_count += vertex_visited[i]; + + result.overfetch = unique_vertex_count == 0 ?
0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); + + return result; +} diff --git a/Source/ThirdParty/meshoptimizer/indexcodec.cpp b/Source/ThirdParty/meshoptimizer/indexcodec.cpp index b30046005..7a8fd6867 100644 --- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp +++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp @@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0; const unsigned char kSequenceHeader = 0xd0; static int gEncodeIndexVersion = 1; +const int kDecodeIndexVersion = 1; typedef unsigned int VertexFifo[16]; typedef unsigned int EdgeFifo[16][2]; @@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons if (fer >= 0 && (fer >> 2) < 15) { + // note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge const unsigned int* order = kTriangleIndexOrder[fer & 3]; unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]]; @@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons int fc = getVertexFifo(vertexfifo, c, vertexfifooffset); // after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a + // note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation int fea = (a == next) ? (next++, 0) : 15; int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15); int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15); @@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count) void meshopt_encodeIndexVersion(int version) { - assert(unsigned(version) <= 1); + assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion)); meshopt::gEncodeIndexVersion = version; } +int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size) +{ + if (buffer_size < 1) + return -1; + + unsigned char header = buffer[0]; + + if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader) + return -1; + + int version = header & 0x0f; + if (version > meshopt::kDecodeIndexVersion) + return -1; + + return version; +} + int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size) { using namespace meshopt; @@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde return -1; int version = buffer[0] & 0x0f; - if (version > 1) + if (version > kDecodeIndexVersion) return -1; EdgeFifo edgefifo; @@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde // fifo reads are wrapped around 16 entry buffer unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0]; unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1]; + unsigned int c = 0; int fec = codetri & 15; @@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde { // fifo reads are wrapped around 16 entry buffer unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15]; - unsigned int c = (fec == 0) ? next : cf; + c = (fec == 0) ? 
next : cf; int fec0 = fec == 0; next += fec0; - // output triangle - writeTriangle(destination, i, index_size, a, b, c); - - // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); - - pushEdgeFifo(edgefifo, c, b, edgefifooffset); - pushEdgeFifo(edgefifo, a, c, edgefifooffset); } else { - unsigned int c = 0; - // fec - (fec ^ 3) decodes 13, 14 into -1, 1 // note that we need to update the last index since free indices are delta-encoded last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last); - // output triangle - writeTriangle(destination, i, index_size, a, b, c); - // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly pushVertexFifo(vertexfifo, c, vertexfifooffset); - - pushEdgeFifo(edgefifo, c, b, edgefifooffset); - pushEdgeFifo(edgefifo, a, c, edgefifooffset); } + + // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); } else { @@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in return -1; int version = buffer[0] & 0x0f; - if (version > 1) + if (version > kDecodeIndexVersion) return -1; const unsigned char* data = buffer + 1; diff --git a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp index f6728345a..4bf9fccad 100644 --- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp +++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp @@ -5,7 +5,9 @@ #include <string.h> // This work is based on: +// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003 // John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010 +// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024 namespace meshopt { @@ -85,6 +87,46 @@ struct VertexStreamHasher } }; +struct VertexCustomHasher +{ + const float* vertex_positions; + size_t vertex_stride_float; + + int (*callback)(void*, unsigned int, unsigned int); + void* context; + + size_t hash(unsigned int index) const + { + const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); + + unsigned int x = key[0], y = key[1], z = key[2]; + + // replace negative zero with zero + x = (x == 0x80000000) ? 0 : x; + y = (y == 0x80000000) ? 0 : y; + z = (z == 0x80000000) ? 0 : z; + + // scramble bits to make sure that integer coordinates have entropy in lower bits + x ^= x >> 17; + y ^= y >> 17; + z ^= z >> 17; + + // Optimized Spatial Hashing for Collision Detection of Deformable Objects + return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791); + } + + bool equal(unsigned int lhs, unsigned int rhs) const + { + const float* lp = vertex_positions + lhs * vertex_stride_float; + const float* rp = vertex_positions + rhs * vertex_stride_float; + + if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2]) + return false; + + return callback ?
callback(context, lhs, rhs) : true; + } +}; + struct EdgeHasher { const unsigned int* remap; @@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position allocator.deallocate(vertex_table); } +template <typename Hash> +static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator) +{ + memset(remap, -1, vertex_count * sizeof(unsigned int)); + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate<unsigned int>(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices ? indices[i] : unsigned(i); + assert(index < vertex_count); + + if (remap[index] != ~0u) + continue; + + unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); + + if (*entry == ~0u) + { + *entry = index; + remap[index] = next_vertex++; + } + else + { + assert(remap[*entry] != ~0u); + remap[index] = remap[*entry]; + } + } + + assert(next_vertex <= vertex_count); + return next_vertex; +} + template <size_t BlockSize> static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) { @@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex } } +template <typename Hash> +static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator) +{ + unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); + memset(remap, -1, vertex_count * sizeof(unsigned int)); + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate<unsigned int>(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + if (remap[index] == ~0u) + { + unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); + + if (*entry == ~0u) + *entry = index; + + remap[index] = *entry; + } + + destination[i] = remap[index]; + } +} + } // namespace meshopt size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) @@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; - - memset(destination, -1, vertex_count * sizeof(unsigned int)); - VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate<unsigned int>(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); - - unsigned int next_vertex = 0; - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices ?
indices[i] : unsigned(i); - assert(index < vertex_count); - - if (destination[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); - - if (*entry == ~0u) - { - *entry = index; - - destination[index] = next_vertex++; - } - else - { - assert(destination[*entry] != ~0u); - - destination[index] = destination[*entry]; - } - } - } - - assert(next_vertex <= vertex_count); - - return next_vertex; + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); } size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count) @@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne } meshopt_Allocator allocator; - - memset(destination, -1, vertex_count * sizeof(unsigned int)); - VertexStreamHasher hasher = {streams, stream_count}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate<unsigned int>(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); +} - unsigned int next_vertex = 0; +size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context) +{ + using namespace meshopt; - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices ? indices[i] : unsigned(i); + assert(indices || index_count == vertex_count); + assert(!indices || index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); - if (destination[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); + meshopt_Allocator allocator; + VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context}; - if (*entry == ~0u) - { - *entry = index; - - destination[index] = next_vertex++; - } - else - { - assert(destination[*entry] != ~0u); - - destination[index] = destination[*entry]; - } - } - } - - assert(next_vertex <= vertex_count); - - return next_vertex; + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); } void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) @@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned assert(vertex_size <= vertex_stride); meshopt_Allocator allocator; - - unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); - memset(remap, -1, vertex_count * sizeof(unsigned int)); - VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride}; +
generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator); } void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count) @@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns } meshopt_Allocator allocator; - - unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); - memset(remap, -1, vertex_count * sizeof(unsigned int)); - VertexStreamHasher hasher = {streams, stream_count}; + generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator); +} + +void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL}; + size_t table_size = hashBuckets(vertex_count); unsigned int* table = allocator.allocate<unsigned int>(table_size); memset(table, -1, table_size * sizeof(unsigned int)); - for (size_t i = 0; i < index_count; ++i) + for (size_t i = 0; i < vertex_count; ++i) { - unsigned int index = indices[i]; - assert(index < vertex_count); + unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u); - if (remap[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); + if (*entry == ~0u) + *entry = unsigned(i); - if (*entry == ~0u) - *entry = index; - - remap[index] = *entry; - } - - destination[i] = remap[index]; + destination[i] = *entry; } } @@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un memcpy(destination + i * 4, patch, sizeof(patch)); } } + +size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count) +{ + assert(index_count % 3 == 0); + + meshopt_Allocator allocator; + + unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); + memset(remap, -1, vertex_count * sizeof(unsigned int)); + + // compute vertex valence; this is used to prioritize least used corner + // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic + unsigned char* valence = allocator.allocate<unsigned char>(vertex_count); + memset(valence, 0, vertex_count); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + valence[index]++; + } + + unsigned int reorder_offset = 0; + + // assign provoking vertices; leave the rest for the next pass + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + // try to rotate triangle such that provoking vertex hasn't been seen before + // if multiple vertices are new, prioritize the one with least valence + // this reduces the risk that a future triangle will have all three vertices seen + unsigned int va = remap[a] == ~0u ? valence[a] : ~0u; + unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u; + unsigned int vc = remap[c] == ~0u ?
valence[c] : ~0u; + + if (vb != ~0u && vb <= va && vb <= vc) + { + // abc -> bca + unsigned int t = a; + a = b, b = c, c = t; + } + else if (vc != ~0u && vc <= va && vc <= vb) + { + // abc -> cab + unsigned int t = c; + c = b, b = a, a = t; + } + + unsigned int newidx = reorder_offset; + + // now remap[a] = ~0u or all three vertices are old + // recording remap[a] makes it possible to remap future references to the same index, conserving space + if (remap[a] == ~0u) + remap[a] = newidx; + + // we need to clone the provoking vertex to get a unique index + // if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these + reorder[reorder_offset++] = a; + + // note: first vertex is final, the other two will be fixed up in next pass + destination[i + 0] = newidx; + destination[i + 1] = b; + destination[i + 2] = c; + + // update vertex valences for corner heuristic + valence[a]--; + valence[b]--; + valence[c]--; + } + + // remap or clone non-provoking vertices (iterating to skip provoking vertices) + int step = 1; + + for (size_t i = 1; i < index_count; i += step, step ^= 3) + { + unsigned int index = destination[i]; + + if (remap[index] == ~0u) + { + // we haven't seen the vertex before as a provoking vertex + // to maintain the reference to the original vertex we need to clone it + unsigned int newidx = reorder_offset; + + remap[index] = newidx; + reorder[reorder_offset++] = index; + } + + destination[i] = remap[index]; + } + + assert(reorder_offset <= vertex_count + index_count / 3); + return reorder_offset; +} diff --git a/Source/ThirdParty/meshoptimizer/meshoptimizer.h b/Source/ThirdParty/meshoptimizer/meshoptimizer.h index 6c8dcd7e8..c9239bc30 100644 --- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h +++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h @@ -1,7 +1,7 @@ /** - * meshoptimizer - version 0.21 + * meshoptimizer - version 1.0 * - * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer * * This library is distributed under the MIT License. See notice at the end of this file. @@ -12,7 +12,7 @@ #include /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 210 /* 0.21 */ +#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -29,11 +29,14 @@ #endif /* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */ +#ifndef MESHOPTIMIZER_EXPERIMENTAL #define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API +#endif /* C interface */ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** @@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, */ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); +/** + * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices + * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence. + * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided). 
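+ * For illustration, a hypothetical callback (names are placeholders) that merges vertices only when per-vertex normals passed via context also match:
+ *   int normals_equal(void* context, unsigned int a, unsigned int b)
+ *   {
+ *       const float* n = (const float*)context; // assumed float3 normal per vertex
+ *       return n[a * 3 + 0] == n[b * 3 + 0] && n[a * 3 + 1] == n[b * 3 + 1] && n[a * 3 + 2] == n[b * 3 + 2];
+ *   }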
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * indices can be NULL if the input is unindexed + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not + */ +MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context); + /** * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap * @@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati */ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); +/** + * Generates a remap table that maps all vertices with the same position to the same (existing) index. + * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering. + * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification. + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + */ +MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + /** * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology * Each triangle is converted into a 6-vertex patch with the following layout: @@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin */ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +/** + * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table + * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute. + * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader. + * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data. + * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering. + * For maximum efficiency the input index buffer should be optimized for vertex cache first. 
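+ * An illustrative call (buffer sizes per the notes below; names are placeholders):
+ *   std::vector<unsigned int> provoking(index_count);
+ *   std::vector<unsigned int> reorder(vertex_count + index_count / 3);
+ *   reorder.resize(meshopt_generateProvokingIndexBuffer(&provoking[0], &reorder[0], indices, index_count, vertex_count));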
+ * + * destination must contain enough space for the resulting index buffer (index_count elements) + * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements) + */ +MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count); + /** * Vertex transform cache optimizer * Reorders indices to reduce the number of GPU vertex shader invocations - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. * * destination must contain enough space for the resulting index buffer (index_count elements) */ @@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio * Vertex transform cache optimizer for FIFO caches * Reorders indices to reduce the number of GPU vertex shader invocations * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. * * destination must contain enough space for the resulting index buffer (index_count elements) * cache_size should be less than the actual GPU cache size to avoid cache thrashing @@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination /** * Overdraw optimizer * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. * * destination must contain enough space for the resulting index buffer (index_count elements) * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) @@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const * Vertex fetch cache optimizer * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused - * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream. + * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream. 
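+ * A sketch of the multi-stream sequence, with positions as one example stream:
+ *   std::vector<unsigned int> remap(vertex_count);
+ *   size_t unique = meshopt_optimizeVertexFetchRemap(&remap[0], indices, index_count, vertex_count);
+ *   meshopt_remapIndexBuffer(indices, indices, index_count, &remap[0]);
+ *   meshopt_remapVertexBuffer(positions, positions, vertex_count, sizeof(float) * 3, &remap[0]); // repeat per stream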
* + * destination must contain enough space for the resulting vertex buffer (vertex_count elements) * indices is used both as an input and as an output index buffer @@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count); /** - * Set index encoder format version + * Set index encoder format version (defaults to 1) + * * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+) */ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version); @@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version); */ MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); +/** + * Get encoded index format version + * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid + * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed. + */ +MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size); + /** * Index sequence encoder * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original. @@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream. * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized. + * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first. * * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size); MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size); /** - * Set vertex encoder format version - * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) + * Vertex buffer encoder + * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level. + * For compression level to take effect, the vertex encoding version must be set to 1. + * The default compression level implied by meshopt_encodeVertexBuffer is 2. + * + * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) + * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio. + * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
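+ * An illustrative call at maximum compression, keeping the default format version:
+ *   std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
+ *   vbuf.resize(meshopt_encodeVertexBufferLevel(&vbuf[0], vbuf.size(), vertices, vertex_count, vertex_size, 3, -1));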
+ */ +MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version); + +/** + * Set vertex encoder format version (defaults to 1) + * + * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+) */ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); @@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); * The decoder is safe to use for untrusted input, but it may produce garbage data. * * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size); +/** + * Get encoded vertex format version + * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid + * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed. + */ +MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size); + /** * Vertex buffer filters * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place. * - * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f. + * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f. * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. * - * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct. + * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct. * Each component is stored as a 16-bit integer; stride must be equal to 8. * * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M. * Each 32-bit component is decoded in isolation; stride must be divisible by 4. + * + * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
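+ * For example, a buffer of count 4-byte octahedral normals produced by meshopt_decodeVertexBuffer could be unpacked in place with:
+ *   meshopt_decodeFilterOct(normals, count, 4);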
*/ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride); /** * Vertex buffer filter encoders * These functions can be used to encode data in a format that meshopt_decodeFilter can decode * - * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output. - * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. + * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is. * Input data must contain 4 floats for every vector (count*4 total). * * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding. @@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24). * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4. * Input data must contain stride/4 floats for every vector (count*stride/4 total). + * + * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits. + * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8. + * Input data must contain 4 floats for every color (count*4 total). 
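The filter encoders and decoders are promoted to the stable API here. A small sketch of an octahedral normal round trip, assuming normals are supplied as count*4 floats and packed at stride 4, where each component is 8-bit and thus K = 8:

```cpp
#include "meshoptimizer.h"

#include <vector>

// Pack unit normals (x, y, z, w per vector) into 4 bytes each via the octahedral filter.
std::vector<unsigned char> packNormals(const std::vector<float>& normals) // count*4 floats
{
    size_t count = normals.size() / 4;
    std::vector<unsigned char> packed(count * 4);
    meshopt_encodeFilterOct(packed.data(), count, /* stride= */ 4, /* bits= */ 8, normals.data());
    return packed;
}

// After meshopt_decodeVertexBuffer reproduces the packed bytes, reverse the filter in-place:
//   meshopt_decodeFilterOct(packed.data(), count, 4);
```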
*/ enum meshopt_EncodeExpMode { @@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode meshopt_EncodeExpSharedVector, /* When encoding exponents, use shared value for each component of all vectors (best compression) */ meshopt_EncodeExpSharedComponent, + /* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */ + meshopt_EncodeExpClamped, }; -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); +MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); +MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data); /** * Simplification options @@ -334,16 +416,34 @@ enum meshopt_SimplifySparse = 1 << 1, /* Treat error limit and resulting error as absolute instead of relative to mesh extents. */ meshopt_SimplifyErrorAbsolute = 1 << 2, + /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */ + meshopt_SimplifyPrune = 1 << 3, + /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */ + meshopt_SimplifyRegularize = 1 << 4, + /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */ + meshopt_SimplifyPermissive = 1 << 5, +}; + +/** + * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs + */ +enum +{ + /* Do not move this vertex. */ + meshopt_SimplifyVertex_Lock = 1 << 0, + /* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */ + meshopt_SimplifyVertex_Protect = 1 << 1, }; /** * Mesh simplifier * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. - * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. * Returns the number of indices after simplification, with destination containing new index data + * * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. 
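A sketch of how the new option bits and vertex_lock flags might combine with meshopt_simplifyWithAttributes (declared further below); the UV attribute layout, the weight values, and the 50% target are illustrative, and note that meshopt_SimplifyPermissive is still flagged experimental above.

```cpp
#include "meshoptimizer.h"

#include <vector>

// Illustrative: simplify with UVs as attributes, pruning small disconnected parts and
// allowing collapses across attribute seams except at explicitly protected vertices.
size_t simplifyWithSeams(std::vector<unsigned int>& out,
                         const std::vector<unsigned int>& indices,
                         const std::vector<float>& positions, // 3 floats per vertex
                         const std::vector<float>& uvs,       // 2 floats per vertex
                         const std::vector<unsigned char>& locks) // meshopt_SimplifyVertex_* flags
{
    const float weights[2] = {1e-2f, 1e-2f}; // relative priority of UV error vs position error
    size_t vertex_count = positions.size() / 3;
    out.resize(indices.size()); // worst case is index_count elements, *not* the target!

    float error = 0.f;
    size_t result = meshopt_simplifyWithAttributes(out.data(), indices.data(), indices.size(),
        positions.data(), vertex_count, sizeof(float) * 3,
        uvs.data(), sizeof(float) * 2, weights, 2,
        locks.empty() ? nullptr : locks.data(),
        indices.size() / 2, /* target_error= */ 0.01f,
        meshopt_SimplifyPrune | meshopt_SimplifyPermissive, &error);
    out.resize(result);
    return result;
}

// Vertices tagged with meshopt_SimplifyVertex_Protect in `locks` keep their attribute
// discontinuities intact; meshopt_SimplifyVertex_Lock pins a vertex entirely.
```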
* * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! * vertex_positions should have float3 position in the first 12 bytes of each vertex @@ -354,45 +454,94 @@ enum MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier with attribute metric - * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details. - * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes. + * Mesh simplifier with attribute metric + * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible. + * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order. + * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. + * Returns the number of indices after simplification, with destination containing new index data * + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * Note that the number of attributes with non-zero weights affects memory requirements and running time. + * + * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! + * vertex_positions should have float3 position in the first 12 bytes of each vertex * vertex_attributes should have attribute_count floats for each vertex - * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range. - * attribute_count must be <= 16 + * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position + * attribute_count must be <= 32 * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved - * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier (sloppy) + * Mesh simplifier with position/attribute update + * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible. + * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance. + * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. + * Returns the number of indices after simplification, indices are destructively updated with new index data + * + * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place. + * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification. + * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated. + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * vertex_attributes should have attribute_count floats for each vertex + * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position + * attribute_count must be <= 32 + * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification + */ +MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); + +/** + * Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! * vertex_positions should have float3 position in the first 12 bytes of each vertex + * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); /** - * Experimental: Point cloud simplifier + * Mesh simplifier (pruner) + * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh + * Returns the number of indices after simplification, with destination containing new index data + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * + * destination must contain enough space for the target index buffer, worst case is index_count elements + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] + */ +MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); + +/** + * Point cloud simplifier * Reduces the number of points in the cloud to reach the given target * Returns the number of points after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer (target_vertex_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex - * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex + * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex + * color_weight determines relative priority of color wrt position; 1.0 is a safe default */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count); +MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count); /** * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents @@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics */ MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size); +struct meshopt_VertexFetchStatistics +{ + unsigned int bytes_fetched; + float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */ +}; + +/** + * Vertex fetch cache analyzer + * Returns cache hit statistics using a simplified direct mapped model + * Results may not match actual GPU performance + */ +MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size); + struct meshopt_OverdrawStatistics { unsigned int pixels_covered; @@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics */ MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); -struct meshopt_VertexFetchStatistics +struct meshopt_CoverageStatistics { - unsigned int bytes_fetched; - float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */ + float coverage[3]; + float extent; /* viewport size in mesh coordinates */ }; /** - * Vertex fetch cache analyzer - * Returns cache hit statistics using a simplified direct mapped model - * Results may not match 
actual GPU performance + * Coverage analyzer + * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +/** + * Meshlet is a small mesh cluster (subset) that consists of: + * - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use; + * - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from. + * + * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data. + */ struct meshopt_Meshlet { /* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */ unsigned int vertex_offset; unsigned int triangle_offset; - /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */ + /* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */ unsigned int vertex_count; unsigned int triangle_count; }; @@ -484,14 +654,15 @@ struct meshopt_Meshlet * Meshlet builder * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers. + * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet. * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters. * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first. * * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound - * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) 
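A quick diagnostic sketch using the analyzers above (meshopt_analyzeVertexFetch is relocated, meshopt_analyzeCoverage is new); the tightly packed float3 position layout is an assumption.

```cpp
#include "meshoptimizer.h"

#include <stdio.h>

// Prints fetch efficiency and per-axis coverage for a mesh; purely diagnostic.
void reportStats(const unsigned int* indices, size_t index_count,
                 const float* positions, size_t vertex_count, size_t vertex_size)
{
    meshopt_VertexFetchStatistics vfs = meshopt_analyzeVertexFetch(indices, index_count, vertex_count, vertex_size);
    printf("overfetch: %.2f (1.0 = each byte fetched once)\n", vfs.overfetch);

    meshopt_CoverageStatistics cs = meshopt_analyzeCoverage(indices, index_count, positions, vertex_count, sizeof(float) * 3);
    printf("coverage x/y/z: %.3f %.3f %.3f, extent: %f\n", cs.coverage[0], cs.coverage[1], cs.coverage[2], cs.extent);
}
```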
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4) + * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512) * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); @@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); /** - * Experimental: Meshlet optimizer - * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput + * Meshlet builder with flexible cluster sizes + * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows specifying the minimum and maximum number of triangles per meshlet. + * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor. * - * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these - * need to be computed from meshlet's vertex_offset and triangle_offset - * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512) + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
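A sketch of the sizing contract for meshopt_buildMeshlets under the updated limits; the kMaxVertices/kMaxTriangles values, the 0.25 cone weight, and the 4-byte triangle alignment used when trimming are illustrative choices, assuming a non-empty index buffer.

```cpp
#include "meshoptimizer.h"

#include <vector>

const size_t kMaxVertices = 64, kMaxTriangles = 124; // common mesh-shader-friendly limits

// Builds meshlets with arrays sized for the worst case, then trims them to actual usage.
size_t buildMeshlets(std::vector<meshopt_Meshlet>& meshlets,
                     std::vector<unsigned int>& meshlet_vertices,
                     std::vector<unsigned char>& meshlet_triangles,
                     const std::vector<unsigned int>& indices,
                     const std::vector<float>& positions) // float3 per vertex
{
    size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), kMaxVertices, kMaxTriangles);
    meshlets.resize(max_meshlets);
    meshlet_vertices.resize(indices.size());  // worst case is index_count elements
    meshlet_triangles.resize(indices.size()); // worst case is index_count elements

    size_t count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), positions.data(), positions.size() / 3, sizeof(float) * 3,
        kMaxVertices, kMaxTriangles, /* cone_weight= */ 0.25f);

    // Trim using the last meshlet's offsets and counts (count >= 1 for non-empty input);
    // triangle data is assumed to stay 4-byte aligned per meshlet.
    const meshopt_Meshlet& last = meshlets[count - 1];
    meshlet_vertices.resize(last.vertex_offset + last.vertex_count);
    meshlet_triangles.resize(last.triangle_offset + ((last.triangle_count * 3 + 3) & ~3));
    meshlets.resize(count);
    return count;
}
```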
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) + * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency + * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); + +/** + * Meshlet builder that produces clusters optimized for raytracing + * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows specifying the minimum and maximum number of triangles per meshlet. + * + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) + * fill_weight allows prioritizing clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default + */ +MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); + +/** + * Meshlet optimizer + * Reorders meshlet vertices and triangles to maximize locality, which can improve rasterizer throughput or ray tracing performance when using fast-build modes. + * + * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset + * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512) + */ +MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count); struct meshopt_Bounds { @@ -544,11 +742,35 @@ struct meshopt_Bounds * Real-Time Rendering 4th Edition, section 19.3).
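Per the optimizer comment above, per-meshlet pointers are derived from vertex_offset and triangle_offset; a minimal sketch of running the now-stable meshopt_optimizeMeshlet over every meshlet:

```cpp
#include "meshoptimizer.h"

// Reorders the data of each meshlet in-place to improve locality.
void optimizeMeshlets(const meshopt_Meshlet* meshlets, size_t count,
                      unsigned int* meshlet_vertices, unsigned char* meshlet_triangles)
{
    for (size_t i = 0; i < count; ++i)
    {
        const meshopt_Meshlet& m = meshlets[i];
        meshopt_optimizeMeshlet(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset],
                                m.triangle_count, m.vertex_count);
    }
}
```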
* * vertex_positions should have float3 position in the first 12 bytes of each vertex - * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size) + * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet + * index_count/3 and triangle_count must not exceed implementation limits (<= 512) */ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +/** + * Sphere bounds generator + * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0. + * + * positions should have float3 position in the first 12 bytes of each element + * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element + */ +MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride); + +/** + * Cluster partitioner + * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other. + * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs. + * + * destination must contain enough space for the resulting partition data (cluster_count elements) + * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function + * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially + * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count + * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex + * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3) + */ +MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); + /** * Spatial sorter * Generates a remap table that can be used to reorder points for spatial locality. @@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** - * Experimental: Spatial sorter + * Spatial sorter * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. 
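A sketch of feeding meshlet vertex lists to the new meshopt_partitionClusters; flattening per-meshlet vertices into sequential index lists matches the documented input format, while the target size of 8 clusters per partition is an arbitrary example.

```cpp
#include "meshoptimizer.h"

#include <vector>

// Groups meshlets into partitions of roughly 8 clusters each, preferring meshlets
// that share vertices or are spatially close; returns the number of partitions.
size_t partitionMeshlets(std::vector<unsigned int>& partitions,
                         const std::vector<meshopt_Meshlet>& meshlets,
                         const std::vector<unsigned int>& meshlet_vertices,
                         const std::vector<float>& positions) // float3 per vertex
{
    // Flatten each meshlet's vertex list; the partitioner takes sequential index lists.
    std::vector<unsigned int> cluster_indices;
    std::vector<unsigned int> cluster_counts;
    for (const meshopt_Meshlet& m : meshlets)
    {
        cluster_indices.insert(cluster_indices.end(),
            meshlet_vertices.begin() + m.vertex_offset,
            meshlet_vertices.begin() + m.vertex_offset + m.vertex_count);
        cluster_counts.push_back(m.vertex_count);
    }

    partitions.resize(meshlets.size()); // one partition id per cluster
    return meshopt_partitionClusters(partitions.data(), cluster_indices.data(), cluster_indices.size(),
        cluster_counts.data(), meshlets.size(),
        positions.data(), positions.size() / 3, sizeof(float) * 3, /* target_partition_size= */ 8);
}
```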
* * destination must contain enough space for the resulting index buffer (index_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Spatial clusterizer + * Reorders points into clusters optimized for spatial locality, and generates a new index buffer. + * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size. + * + * destination must contain enough space for the resulting index buffer (vertex_count elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + */ +MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size); + +/** + * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value + * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest + * Representable magnitude range: [6e-5; 65504] + * Maximum relative reconstruction error: 5e-4 + */ +MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v); + +/** + * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation + * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest + * Assumes N is in a valid mantissa precision range, which is 1..23 + */ +MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N); + +/** + * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value + * Preserves Inf/NaN, flushes denormals to zero + */ +MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h); /** * Set allocation callbacks @@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti * Note that all algorithms only allocate memory for temporary use. * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first. 
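With meshopt_quantizeHalf/meshopt_quantizeFloat/meshopt_dequantizeHalf now exposed through the C API (the unorm/snorm helpers remain C++-only inlines), here is a sketch of packing a hypothetical vertex; the fp16 position plus 16-bit unorm UV layout assumes UVs in the [0..1] range.

```cpp
#include "meshoptimizer.h"

// Hypothetical packed vertex: fp16 position, 16-bit unorm UV.
struct PackedVertex
{
    unsigned short px, py, pz, pw; // meshopt_quantizeHalf; pw is padding
    unsigned short u, v;           // meshopt_quantizeUnorm with N = 16
};

PackedVertex packVertex(const float pos[3], const float uv[2])
{
    PackedVertex r;
    r.px = meshopt_quantizeHalf(pos[0]);
    r.py = meshopt_quantizeHalf(pos[1]);
    r.pz = meshopt_quantizeHalf(pos[2]);
    r.pw = 0; // keep padding zero-initialized; the vertex codec encodes it verbatim
    r.u = (unsigned short)meshopt_quantizeUnorm(uv[0], 16);
    r.v = (unsigned short)meshopt_quantizeUnorm(uv[1], 16);
    return r;
}

// Reconstruction error can be inspected with meshopt_dequantizeHalf(r.px), etc.
```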
*/ -MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*)); +MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*)); #ifdef __cplusplus } /* extern "C" */ #endif -/* Quantization into commonly supported data formats */ +/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */ #ifdef __cplusplus /** * Quantize a float in [0..1] range into an N-bit fixed point unorm value @@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N); * Maximum reconstruction error: 1/2^N */ inline int meshopt_quantizeSnorm(float v, int N); - -/** - * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value - * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest - * Representable magnitude range: [6e-5; 65504] - * Maximum relative reconstruction error: 5e-4 - */ -MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v); - -/** - * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation - * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest - * Assumes N is in a valid mantissa precision range, which is 1..23 - */ -MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N); - -/** - * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value - * Preserves Inf/NaN, flushes denormals to zero - */ -MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h); #endif /** @@ -631,6 +863,10 @@ template inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size); template inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count); +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback); +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback); template inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap); template @@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice template inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template +inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count); +template inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count); template inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count); @@ -661,29 +899,44 @@ template inline size_t 
meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count); template inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size); +inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level); template inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL); template inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL); template +inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL); +template inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL); template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL); +template +inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); +template inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index); template inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index); template -inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size); +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size); +template +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); template inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template -inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t 
index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); template inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); template +inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); +template +inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +template inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template +inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); +template inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); #endif @@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N) class meshopt_Allocator { public: - template - struct StorageT + struct Storage { - static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t); - static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*); + void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t); + void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*); }; - typedef StorageT Storage; +#ifdef MESHOPTIMIZER_ALLOC_EXPORT + MESHOPTIMIZER_API static Storage& storage(); +#else + static Storage& storage() + { + static Storage s = {::operator new, ::operator delete }; + return s; + } +#endif meshopt_Allocator() - : blocks() - , count(0) + : blocks() + , count(0) { } ~meshopt_Allocator() { for (size_t i = count; i > 0; --i) - Storage::deallocate(blocks[i - 1]); + storage().deallocate(blocks[i - 1]); } - template T* allocate(size_t size) + template + T* allocate(size_t size) { assert(count < sizeof(blocks) / sizeof(blocks[0])); - T* result = static_cast(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T))); + T* result = static_cast(storage().allocate(size > size_t(-1) / sizeof(T) ? 
size_t(-1) : size * sizeof(T))); blocks[count++] = result; return result; } @@ -749,7 +1010,7 @@ public: void deallocate(void* ptr) { assert(count > 0 && blocks[count - 1] == ptr); - Storage::deallocate(ptr); + storage().deallocate(ptr); count--; } @@ -757,10 +1018,6 @@ private: void* blocks[24]; size_t count; }; - -// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker -template void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT::allocate)(size_t) = operator new; -template void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT::deallocate)(void*) = operator delete; #endif /* Inline implementation for C++ templated wrappers */ @@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter { size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int); - data = static_cast(meshopt_Allocator::Storage::allocate(size)); + data = static_cast(meshopt_Allocator::storage().allocate(size)); if (input) { @@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter result[i] = T(data[i]); } - meshopt_Allocator::Storage::deallocate(data); + meshopt_Allocator::storage().deallocate(data); } }; @@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count); } +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback) +{ + struct Call + { + static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; } + }; + + return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); +} + +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback) +{ + struct Call + { + static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; } + }; + + meshopt_IndexAdapter in(NULL, indices, indices ? index_count : 0); + + return meshopt_generateVertexRemapCustom(destination, indices ? 
in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); +} + template inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap) { @@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); } +template +inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + size_t bound = vertex_count + (index_count / 3); + assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T + (void)bound; + + return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count); +} + template inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count) { @@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size); } +inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level) +{ + return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1); +} + template inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error) { @@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error); } +template +inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error) +{ + meshopt_IndexAdapter inout(indices, indices, index_count); + + return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error); +} + template inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error) { meshopt_IndexAdapter in(NULL, indices, index_count); meshopt_IndexAdapter out(destination, NULL, index_count); - return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error); + 
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error); +} + +template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error); +} + +template +inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error); } template @@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_ } template -inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size) +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size) { meshopt_IndexAdapter in(NULL, indices, index_count); - return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size); + return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size); +} + +template +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + + return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); } template @@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size } template -inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { meshopt_IndexAdapter in(NULL, indices, index_count); - return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); + return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); } template @@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles); } +template +inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t 
vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + + return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor); +} + +template +inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + + return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight); +} + template inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { @@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); } +template +inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size) +{ + meshopt_IndexAdapter in(NULL, cluster_indices, total_index_count); + + return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size); +} + template inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { @@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_ #endif /** - * Copyright (c) 2016-2024 Arseny Kapoulkine + * Copyright (c) 2016-2025 Arseny Kapoulkine * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp index cc22dbcff..682b924a9 100644 --- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp +++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp @@ -10,24 +10,24 @@ namespace meshopt { -static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count) +static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); float mesh_centroid[3] = {}; - for (size_t i = 0; i < index_count; ++i) + for (size_t i = 0; i < vertex_count; ++i) { - const float* p = 
vertex_positions + vertex_stride_float * indices[i]; + const float* p = vertex_positions + vertex_stride_float * i; mesh_centroid[0] += p[0]; mesh_centroid[1] += p[1]; mesh_centroid[2] += p[2]; } - mesh_centroid[0] /= index_count; - mesh_centroid[1] /= index_count; - mesh_centroid[2] /= index_count; + mesh_centroid[0] /= float(vertex_count); + mesh_centroid[1] /= float(vertex_count); + mesh_centroid[2] /= float(vertex_count); for (size_t cluster = 0; cluster < cluster_count; ++cluster) { @@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind // fill sort data float* sort_data = allocator.allocate(cluster_count); - calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count); + calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count); // sort clusters using sort data unsigned short* sort_keys = allocator.allocate(cluster_count); diff --git a/Source/ThirdParty/meshoptimizer/partition.cpp b/Source/ThirdParty/meshoptimizer/partition.cpp new file mode 100644 index 000000000..4119a53ed --- /dev/null +++ b/Source/ThirdParty/meshoptimizer/partition.cpp @@ -0,0 +1,624 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include +#include +#include + +// This work is based on: +// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991 +namespace meshopt +{ + +// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth +const int kMergeDepthCutoff = 40; + +struct ClusterAdjacency +{ + unsigned int* offsets; + unsigned int* clusters; + unsigned int* shared; +}; + +static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count) +{ + (void)vertex_count; + (void)total_index_count; + + size_t cluster_start = 0; + size_t cluster_write = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + offsets[i] = unsigned(cluster_write); + + // copy cluster indices, skipping duplicates + for (size_t j = 0; j < cluster_index_counts[i]; ++j) + { + unsigned int v = cluster_indices[cluster_start + j]; + assert(v < vertex_count); + + data[cluster_write] = v; + cluster_write += 1 - used[v]; + used[v] = 1; + } + + // reset used flags for the next cluster + for (size_t j = offsets[i]; j < cluster_write; ++j) + used[data[j]] = 0; + + cluster_start += cluster_index_counts[i]; + } + + assert(cluster_start == total_index_count); + assert(cluster_write <= total_index_count); + offsets[cluster_count] = unsigned(cluster_write); +} + +static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center) +{ + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + float center[3] = {0, 0, 0}; + + // approximate center of the cluster by averaging all vertex positions + for (size_t j = 0; j < index_count; ++j) + { + const float* p = vertex_positions + indices[j] * vertex_stride_float; + + center[0] += p[0]; + center[1] += p[1]; + center[2] += p[2]; + } + + // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes + if (index_count) + { + center[0] /= 
+ +static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator) +{ + unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1); + + // compute number of clusters referenced by each vertex + memset(ref_offsets, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + ref_offsets[cluster_indices[j]]++; + } + + // compute (worst-case) number of adjacent clusters for each cluster + size_t total_adjacency = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + size_t count = 0; + + // worst case is every vertex has a disjoint cluster list + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + count += ref_offsets[cluster_indices[j]] - 1; + + // ... but only every other cluster can be adjacent in the end + total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1; + } + + // we can now allocate adjacency buffers + adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1); + adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency); + adjacency.shared = allocator.allocate<unsigned int>(total_adjacency); + + // convert ref counts to offsets + size_t total_refs = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + size_t count = ref_offsets[i]; + ref_offsets[i] = unsigned(total_refs); + total_refs += count; + } + + unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs); + + // fill cluster refs for each vertex + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i); + } + + // after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start + memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int)); + ref_offsets[0] = 0; + + // fill cluster adjacency for each cluster... + adjacency.offsets[0] = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + unsigned int* adj = adjacency.clusters + adjacency.offsets[i]; + unsigned int* shd = adjacency.shared + adjacency.offsets[i]; + size_t count = 0; + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + + // merge the entire cluster list of each vertex into current list + for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k) + { + unsigned int c = ref_data[k]; + assert(c < cluster_count); + + if (c == unsigned(i)) + continue; + + // if the cluster is already in the list, increment the shared count + bool found = false; + for (size_t l = 0; l < count; ++l) + if (adj[l] == c) + { + found = true; + shd[l]++; + break; + } + + // .. 
or append a new cluster + if (!found) + { + adj[count] = c; + shd[count] = 1; + count++; + } + } + } + + // mark the end of the adjacency list; the next cluster will start there as well + adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count); + } + + assert(adjacency.offsets[cluster_count] <= total_adjacency); + + // ref_offsets can't be deallocated as it was allocated before adjacency + allocator.deallocate(ref_data); +} + +struct ClusterGroup +{ + int group; + int next; + unsigned int size; // 0 unless root + unsigned int vertices; + + float center[3]; + float radius; +}; + +struct GroupOrder +{ + unsigned int id; + int order; +}; + +static void heapPush(GroupOrder* heap, size_t size, GroupOrder item) +{ + // insert a new element at the end (breaks heap invariant) + heap[size++] = item; + + // bubble up the new element to its correct position + size_t i = size - 1; + while (i > 0 && heap[i].order < heap[(i - 1) / 2].order) + { + size_t p = (i - 1) / 2; + + GroupOrder temp = heap[i]; + heap[i] = heap[p]; + heap[p] = temp; + i = p; + } +} + +static GroupOrder heapPop(GroupOrder* heap, size_t size) +{ + assert(size > 0); + GroupOrder top = heap[0]; + + // move the last element to the top (breaks heap invariant) + heap[0] = heap[--size]; + + // bubble down the new top element to its correct position + size_t i = 0; + while (i * 2 + 1 < size) + { + // find the smallest child + size_t j = i * 2 + 1; + j += (j + 1 < size && heap[j + 1].order < heap[j].order); + + // if the parent is already smaller than both children, we're done + if (heap[j].order >= heap[i].order) + break; + + // otherwise, swap the parent and child and continue + GroupOrder temp = heap[i]; + heap[i] = heap[j]; + heap[j] = temp; + i = j; + } + + return top; +} + +static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency) +{ + unsigned int total = 0; + + for (int i1 = group1; i1 >= 0; i1 = groups[i1].next) + for (int i2 = group2; i2 >= 0; i2 = groups[i2].next) + { + for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj) + if (adjacency.clusters[adj] == unsigned(i2)) + { + total += adjacency.shared[adj]; + break; + } + } + + return total; +} + +static void mergeBounds(ClusterGroup& target, const ClusterGroup& source) +{ + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; + float d = sqrtf(dx * dx + dy * dy + dz * dz); + + if (d + r1 < r2) + { + target.center[0] = source.center[0]; + target.center[1] = source.center[1]; + target.center[2] = source.center[2]; + target.radius = source.radius; + return; + } + + if (d + r2 > r1) + { + float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f; + + target.center[0] += dx * k; + target.center[1] += dy * k; + target.center[2] += dz * k; + target.radius = (d + r2 + r1) / 2; + } +} + +static float boundsScore(const ClusterGroup& target, const ClusterGroup& source) +{ + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; + float d = sqrtf(dx * dx + dy * dy + dz * dz); + + float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2); + + return mr > 0 ? 
r1 / mr : 0.f; +} + +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds) +{ + assert(groups[id].size > 0); + + float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices))); + + int best_group = -1; + float best_score = 0; + + for (int ci = id; ci >= 0; ci = groups[ci].next) + { + for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj) + { + int other = groups[adjacency.clusters[adj]].group; + if (other < 0) + continue; + + assert(groups[other].size > 0); + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + unsigned int shared = countShared(groups, id, other, adjacency); + float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices))); + + // normalize shared count by the expected boundary of each group (+ keeps scoring symmetric) + float score = float(int(shared)) * (group_rsqrt + other_rsqrt); + + // incorporate spatial score to favor merging nearby groups + if (use_bounds) + score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]); + + if (score > best_score) + { + best_group = other; + best_score = score; + } + } + } + + return best_group; +} + +static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size) +{ + for (size_t i = 0; i < count; ++i) + { + unsigned int id = order[i]; + if (groups[id].size == 0 || groups[id].size >= target_partition_size) + continue; + + float best_score = -1.f; + int best_group = -1; + + for (size_t j = 0; j < count; ++j) + { + unsigned int other = order[j]; + if (id == other || groups[other].size == 0) + continue; + + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + // favor merging nearby groups + float score = boundsScore(groups[id], groups[other]); + + if (score > best_score) + { + best_score = score; + best_group = other; + } + } + + // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target + if (best_group != -1) + { + // combine groups by linking them together + unsigned int tail = best_group; + while (groups[tail].next >= 0) + tail = groups[tail].next; + + groups[tail].next = id; + + // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge + groups[best_group].size += groups[id].size; + groups[id].size = 0; + + // merge bounding spheres + mergeBounds(groups[best_group], groups[id]); + groups[id].radius = 0.f; + } + } +} + +static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = groups[order[i]].center[axis]; + + // swap(m, i) unconditionally + unsigned int t = order[m]; + order[m] = order[i]; + order[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth) +{ + size_t total = 0; + for (size_t i = 0; i < count; ++i) + total += groups[order[i]].size; + + if (total <= max_partition_size || count <= leaf_size) + return mergeLeaf(groups, order, count, target_partition_size, max_partition_size); + + float mean[3] 
= {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = groups[order[i]].center; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + + float split = mean[axis]; + size_t middle = mergePartition(order, count, groups, axis, split); + + // enforce balance for degenerate partitions + // this also ensures recursion depth is bounded on pathological inputs + if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff) + middle = count / 2; + + // recursion depth is logarithmic and bounded due to max depth check above + mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1); + mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1); +} + +} // namespace meshopt + +size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size) +{ + using namespace meshopt; + + assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + assert(target_partition_size > 0); + + size_t max_partition_size = target_partition_size + target_partition_size / 3; + + meshopt_Allocator allocator; + + unsigned char* used = allocator.allocate<unsigned char>(vertex_count); + memset(used, 0, vertex_count); + + unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count); + unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1); + + // make new cluster index list that filters out duplicate indices + filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count); + cluster_indices = cluster_newindices; + + // build cluster adjacency along with edge weights (shared vertex count) + ClusterAdjacency adjacency = {}; + buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator); + + ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count); + memset(groups, 0, sizeof(ClusterGroup) * cluster_count); + + GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count); + size_t pending = 0; + + // create a singleton group for each cluster and order them by priority + for (size_t i = 0; i < cluster_count; ++i) + { + groups[i].group = int(i); + groups[i].next = -1; + groups[i].size = 1; + groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i]; + assert(groups[i].vertices > 0); + + // compute bounding sphere for each cluster if positions are provided + if (vertex_positions) + groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center); + + GroupOrder item = {}; + item.id = unsigned(i); + item.order = groups[i].vertices; + + heapPush(order, pending++, item); + }
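Each cluster starts out as its own singleton group, and the heap orders groups by their approximate vertex count, so the smallest group always surfaces next for merging. A self-contained sketch of that min-heap ordering (a compact variant of the heapPush helper above, not the library code itself):

#include <assert.h>
#include <stddef.h>

struct Order { unsigned int id; int order; };

// Sift-up insertion into a binary min-heap keyed on 'order'.
static void push(Order* heap, size_t& size, Order item)
{
    heap[size++] = item;
    for (size_t i = size - 1; i > 0 && heap[i].order < heap[(i - 1) / 2].order; i = (i - 1) / 2)
    {
        Order t = heap[i];
        heap[i] = heap[(i - 1) / 2];
        heap[(i - 1) / 2] = t;
    }
}

int main()
{
    Order heap[4];
    size_t size = 0;
    int vertices[4] = {42, 7, 19, 3}; // hypothetical per-group vertex counts

    for (unsigned int i = 0; i < 4; ++i)
    {
        Order item = {i, vertices[i]};
        push(heap, size, item);
    }

    assert(heap[0].order == 3); // the group with the fewest vertices is processed first
    return 0;
}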
+ + // iteratively merge the smallest group with the best group + while (pending) + { + GroupOrder top = heapPop(order, pending--); + + // this group was merged into another group earlier + if (groups[top.id].size == 0) + continue; + + // disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted + for (int i = top.id; i >= 0; i = groups[i].next) + { + assert(groups[i].group == int(top.id)); + groups[i].group = -1; + } + + // the group is large enough, emit as is + if (groups[top.id].size >= target_partition_size) + continue; + + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions); + + // we can't grow the group any more, emit as is + if (best_group == -1) + continue; + + // compute shared vertices to adjust the total vertices estimate after merging + unsigned int shared = countShared(groups, top.id, best_group, adjacency); + + // combine groups by linking them together + unsigned int tail = top.id; + while (groups[tail].next >= 0) + tail = groups[tail].next; + + groups[tail].next = best_group; + + // update group sizes; note, the vertex update is an O(1) approximation which avoids recomputing the true size + groups[top.id].size += groups[best_group].size; + groups[top.id].vertices += groups[best_group].vertices; + groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1; + + groups[best_group].size = 0; + groups[best_group].vertices = 0; + + // merge bounding spheres if bounds are available + if (vertex_positions) + { + mergeBounds(groups[top.id], groups[best_group]); + groups[best_group].radius = 0; + } + + // re-associate all clusters back to the merged group + for (int i = top.id; i >= 0; i = groups[i].next) + groups[i].group = int(top.id); + + top.order = groups[top.id].vertices; + heapPush(order, pending++, top); + } + + // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone + if (vertex_positions) + { + unsigned int* merge_order = reinterpret_cast<unsigned int*>(order); + size_t merge_offset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + if (groups[i].size) + merge_order[merge_offset++] = unsigned(i); + + mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0); + } + + // output each remaining group + size_t next_group = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + if (groups[i].size == 0) + continue; + + for (int j = int(i); j >= 0; j = groups[j].next) + destination[j] = unsigned(next_group); + + next_group++; + } + + assert(next_group <= cluster_count); + return next_group; +}
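For reference, a sketch of how meshopt_partitionClusters might be driven from engine code; the inputs here (concatenated per-cluster index lists plus per-cluster index counts) are hypothetical, and a target_partition_size of 8 is an arbitrary choice rather than a library recommendation:

#include "meshoptimizer.h"

#include <vector>

// The destination receives one partition id per cluster; the return value is
// the number of partitions produced (at most cluster_count).
std::vector<unsigned int> partitionExample(const std::vector<unsigned int>& cluster_indices, const std::vector<unsigned int>& cluster_index_counts, const std::vector<float>& positions)
{
    std::vector<unsigned int> partition_ids(cluster_index_counts.size());

    size_t partition_count = meshopt_partitionClusters(partition_ids.data(), cluster_indices.data(), cluster_indices.size(), cluster_index_counts.data(), cluster_index_counts.size(), positions.data(), positions.size() / 3, sizeof(float) * 3, /* target_partition_size= */ 8);
    (void)partition_count;

    return partition_ids;
}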
diff --git a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp b/Source/ThirdParty/meshoptimizer/rasterizer.cpp similarity index 62% rename from Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp rename to Source/ThirdParty/meshoptimizer/rasterizer.cpp index 31cf6f146..bd788ffdb 100644 --- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp +++ b/Source/ThirdParty/meshoptimizer/rasterizer.cpp @@ -18,14 +18,6 @@ struct OverdrawBuffer unsigned int overdraw[kViewport][kViewport][2]; }; -#ifndef min -#define min(a, b) ((a) < (b) ? (a) : (b)) -#endif - -#ifndef max -#define max(a, b) ((a) > (b) ? (a) : (b)) -#endif - static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3) { // z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1) @@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1); float invdet = (det == 0) ? 0 : 1 / det; - dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet; - dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet; + dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet; + dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet; return det; }
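The dzdx/dzdy change above is an operator-precedence fix: in the old expression the trailing * invdet bound only to the last product, not to the whole difference. A tiny standalone check of the two parses:

#include <assert.h>

int main()
{
    float a = 8.f, b = 2.f, c = 3.f, invdet = 0.5f;

    float old_parse = a - b * c * invdet; // parses as a - (b * c * invdet) == 5
    float fixed = (a - b * c) * invdet;   // scales the whole difference == 1

    assert(old_parse == 5.f && fixed == 1.f);
    return 0;
}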
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f // bounding rectangle, clipped against viewport // since we rasterize pixels with covered centers, min >0.5 should round up // as for max, due to top-left filling convention we will never rasterize right/bottom edges - // so max >= 0.5 should round down - int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0); - int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport); - int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0); - int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport); + // so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case) + int minx = X1 < X2 ? X1 : X2; + minx = minx < X3 ? minx : X3; + minx = (minx + 7) >> 4; + minx = minx < 0 ? 0 : minx; + + int miny = Y1 < Y2 ? Y1 : Y2; + miny = miny < Y3 ? miny : Y3; + miny = (miny + 7) >> 4; + miny = miny < 0 ? 0 : miny; + + int maxx = X1 > X2 ? X1 : X2; + maxx = maxx > X3 ? maxx : X3; + maxx = (maxx + 7) >> 4; + maxx = maxx > kViewport ? kViewport : maxx; + + int maxy = Y1 > Y2 ? Y1 : Y2; + maxy = maxy > Y3 ? maxy : Y3; + maxy = (maxy + 7) >> 4; + maxy = maxy > kViewport ? kViewport : maxy; // deltas, 28.4 fixed point int DX12 = X1 - X2; @@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f } } -} // namespace meshopt - -meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - meshopt_Allocator allocator; - size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - meshopt_OverdrawStatistics result = {}; - float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; @@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, for (int j = 0; j < 3; ++j) { - minv[j] = min(minv[j], v[j]); - maxv[j] = max(maxv[j], v[j]); + float vj = v[j]; + + minv[j] = minv[j] > vj ? vj : minv[j]; + maxv[j] = maxv[j] < vj ? vj : maxv[j]; } } - float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2])); - float scale = kViewport / extent; + float extent = 0.f; - float* triangles = allocator.allocate<float>(index_count * 3); + extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]); + extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); + extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); + + float scale = kViewport / extent; for (size_t i = 0; i < index_count; ++i) { @@ -186,31 +186,55 @@ triangles[i * 3 + 2] = (v[2] - minv[2]) * scale; } + return extent; +} + +static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis) +{ + for (size_t i = 0; i < index_count; i += 3) + { + const float* vn0 = &triangles[3 * (i + 0)]; + const float* vn1 = &triangles[3 * (i + 1)]; + const float* vn2 = &triangles[3 * (i + 2)]; + + switch (axis) + { + case 0: + rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]); + break; + case 1: + rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]); + break; + case 2: + rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]); + break; + } + } +} + +} // namespace meshopt + +meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + meshopt_OverdrawStatistics result = {}; + + float* triangles = allocator.allocate<float>(index_count * 3); + transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1); for (int axis = 0; axis < 3; ++axis) { memset(buffer, 0, sizeof(OverdrawBuffer)); - - for (size_t i = 0; i < index_count; i += 3) - { - const float* vn0 = &triangles[3 * (i + 0)]; - const float* vn1 = &triangles[3 * (i + 1)]; - const float* vn2 = &triangles[3 * (i + 2)]; - - switch (axis) - { - case 0: - rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]); - break; - case 1: - rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]); - break; - case 2: - rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]); - break; - } - } + rasterizeTriangles(buffer, triangles, index_count, axis); for (int y = 0; y < kViewport; ++y) for (int x = 0; x < kViewport; ++x) @@ -227,3 +251,39 @@ return result; }
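meshopt_analyzeCoverage below follows the same transform-and-rasterize path as the overdraw analyzer; per the code, coverage[k] is the fraction of viewport pixels covered when the mesh is projected along axis k, and extent is the world-space size used for normalization. A hedged usage sketch (printCoverage and its inputs are hypothetical):

#include "meshoptimizer.h"

#include <stdio.h>
#include <vector>

void printCoverage(const std::vector<unsigned int>& indices, const std::vector<float>& positions)
{
    meshopt_CoverageStatistics stats = meshopt_analyzeCoverage(indices.data(), indices.size(), positions.data(), positions.size() / 3, sizeof(float) * 3);

    // coverage[k] is in [0, 1] per projection axis; extent is the bounding box size
    printf("coverage x/y/z: %.3f %.3f %.3f, extent: %f\n", stats.coverage[0], stats.coverage[1], stats.coverage[2], stats.extent);
}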
+ +meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + meshopt_CoverageStatistics result = {}; + + float* triangles = allocator.allocate<float>(index_count * 3); + float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + + OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1); + + for (int axis = 0; axis < 3; ++axis) + { + memset(buffer, 0, sizeof(OverdrawBuffer)); + rasterizeTriangles(buffer, triangles, index_count, axis); + + unsigned int covered = 0; + + for (int y = 0; y < kViewport; ++y) + for (int x = 0; x < kViewport; ++x) + covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0; + + result.coverage[axis] = float(covered) / float(kViewport * kViewport); + } + + result.extent = extent; + + return result; +} diff --git a/Source/ThirdParty/meshoptimizer/simplifier.cpp b/Source/ThirdParty/meshoptimizer/simplifier.cpp index e59b4afcd..14d4d42fe 100644 --- a/Source/ThirdParty/meshoptimizer/simplifier.cpp +++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp @@ -27,6 +27,7 @@ // Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003 // Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019 // Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999 +// Hugues Hoppe, Steve Marschner. Efficient Minimization of New Quadric Metric for Simplifying Meshes with Appearance Attributes. 2000 namespace meshopt { @@ -118,10 +119,17 @@ struct PositionHasher unsigned int ri = sparse_remap ? sparse_remap[index] : index; const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + ri * vertex_stride_float); + unsigned int x = key[0], y = key[1], z = key[2]; + + // replace negative zero with zero + x = (x == 0x80000000) ? 0 : x; + y = (y == 0x80000000) ? 0 : y; + z = (z == 0x80000000) ? 0 : z; + // scramble bits to make sure that integer coordinates have entropy in lower bits - unsigned int x = key[0] ^ (key[0] >> 17); - unsigned int y = key[1] ^ (key[1] >> 17); - unsigned int z = key[2] ^ (key[2] >> 17); + x ^= x >> 17; + y ^= y >> 17; + z ^= z >> 17; // Optimized Spatial Hashing for Collision Detection of Deformable Objects return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791); @@ -132,7 +140,10 @@ struct PositionHasher unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs; unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs; - return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0; + const float* lv = vertex_positions + li * vertex_stride_float; + const float* rv = vertex_positions + ri * vertex_stride_float; + + return lv[0] == rv[0] && lv[1] == rv[1] && lv[2] == rv[2]; } };
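The negative-zero handling above matters because the hasher works on raw float bits while the equality test now compares float values: -0.0f == 0.0f as floats, but the bit patterns (0x00000000 vs 0x80000000) differ, so without canonicalization equal positions could land in different hash buckets. A small demonstration:

#include <assert.h>
#include <string.h>

int main()
{
    float pz = 0.f, nz = -0.f;
    unsigned int pbits, nbits;
    memcpy(&pbits, &pz, sizeof(float));
    memcpy(&nbits, &nz, sizeof(float));

    assert(pz == nz);       // IEEE 754: -0.0f compares equal to 0.0f
    assert(pbits != nbits); // but nbits == 0x80000000 while pbits == 0

    // canonicalize before hashing, mirroring PositionHasher::hash above
    nbits = (nbits == 0x80000000) ? 0 : nbits;
    assert(pbits == nbits);
    return 0;
}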
@@ -208,6 +219,11 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f remap[index] = *entry; } + allocator.deallocate(table); + + if (!wedge) + return; + // build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex? // entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i for (size_t i = 0; i < vertex_count; ++i) @@ -221,22 +237,24 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f wedge[i] = wedge[r]; wedge[r] = unsigned(i); } - - allocator.deallocate(table); } static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator) { // use a bit set to compute the precise number of unique vertices unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8); - memset(filter, 0, (vertex_count + 7) / 8); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + filter[index / 8] = 0; + } size_t unique = 0; for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - assert(index < vertex_count); - unique += (filter[index / 8] & (1 << (index % 8))) == 0; filter[index / 8] |= 1 << (index % 8); } @@ -255,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u); if (*entry == ~0u) @@ -288,14 +305,14 @@ enum VertexKind }; // manifold vertices can collapse onto anything -// border/seam vertices can only be collapsed onto border/seam respectively +// border/seam vertices can collapse onto border/seam respectively, or locked // complex vertices can collapse onto complex/locked // a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex // for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore const unsigned char kCanCollapse[Kind_Count][Kind_Count] = { {1, 1, 1, 1, 1}, - {0, 1, 0, 0, 0}, - {0, 0, 1, 0, 0}, + {0, 1, 0, 0, 1}, + {0, 0, 1, 0, 1}, {0, 0, 0, 1, 1}, {0, 0, 0, 0, 0}, }; @@ -303,11 +320,13 @@ const unsigned char kCanCollapse[Kind_Count][Kind_Count] = { // if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge // note that for seam edges, the opposite edge isn't present in the attribute-based topology // but is present if you consider a position-only mesh variant +// while many complex collapses have the opposite edge, since complex vertices collapse to the +// same wedge, keeping opposite edges separate improves the quality by considering both targets const unsigned char kHasOpposite[Kind_Count][Kind_Count] = { - {1, 1, 1, 0, 1}, + {1, 1, 1, 1, 1}, {1, 0, 1, 0, 0}, {1, 1, 1, 0, 1}, - {0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0}, {1, 0, 1, 0, 0}, }; @@ -323,14 +342,33 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int return false; } +static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b, const unsigned int* remap, const unsigned int* wedge) +{ + unsigned int v = a; + + do + { + unsigned int count = adjacency.offsets[v + 1] - adjacency.offsets[v]; + const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[v]; + + for (size_t i = 0; i < count; ++i) + if (remap[edges[i].next] == remap[b]) + return true; + + v = wedge[v]; + } while (v != a); + + return false; +} + static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const 
unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options) { memset(loop, -1, vertex_count * sizeof(unsigned int)); memset(loopback, -1, vertex_count * sizeof(unsigned int)); // incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1 - // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam - // but here it's okay to fill the data out for other types of vertices as well + // note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default + // in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices unsigned int* openinc = loopback; unsigned int* openout = loop; @@ -369,12 +407,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned { if (remap[i] == i) { - if (vertex_lock && vertex_lock[sparse_remap ? sparse_remap[i] : i]) - { - // vertex is explicitly locked - result[i] = Kind_Locked; - } - else if (wedge[i] == i) + if (wedge[i] == i) { // no attribute seam, need to check if it's manifold unsigned int openi = openinc[i], openo = openout[i]; @@ -386,6 +419,13 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned { result[i] = Kind_Manifold; } + else if (openi != ~0u && openo != ~0u && remap[openi] == remap[openo] && openi != i) + { + // classify half-seams as seams (the branch below would mis-classify them as borders) + // half-seam is a single vertex that connects to both vertices of a potential seam + // treating these as seams allows collapsing the "full" seam vertex onto them + result[i] = Kind_Seam; + } else if (openi != i && openo != i) { result[i] = Kind_Border; @@ -407,7 +447,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned if (openiv != ~0u && openiv != i && openov != ~0u && openov != i && openiw != ~0u && openiw != w && openow != ~0u && openow != w) { - if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw]) + if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw] && remap[openiv] != remap[openov]) { result[i] = Kind_Seam; } @@ -438,6 +478,58 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned } } + if (options & meshopt_SimplifyPermissive) + for (size_t i = 0; i < vertex_count; ++i) + if (result[i] == Kind_Seam || result[i] == Kind_Locked) + { + if (remap[i] != i) + { + // only process primary vertices; wedges will be updated to match the primary vertex + result[i] = result[remap[i]]; + continue; + } + + bool protect = false; + + // vertex_lock may protect any wedge, not just the primary vertex, so we switch to complex only if no wedges are protected + unsigned int v = unsigned(i); + do + { + unsigned int rv = sparse_remap ? sparse_remap[v] : v; + protect |= vertex_lock && (vertex_lock[rv] & meshopt_SimplifyVertex_Protect) != 0; + v = wedge[v]; + } while (v != i); + + // protect if any adjoining edge doesn't have an opposite edge (indicating vertex is on the border) + do + { + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[v]]; + size_t count = adjacency.offsets[v + 1] - adjacency.offsets[v]; + + for (size_t j = 0; j < count; ++j) + protect |= !hasEdge(adjacency, edges[j].next, v, remap, wedge); + v = wedge[v]; + } while (v != i); + + result[i] = protect ? 
result[i] : int(Kind_Complex); + } + + if (vertex_lock) + { + // vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i); + + if (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) + result[remap[i]] = Kind_Locked; + } + + for (size_t i = 0; i < vertex_count; ++i) + if (result[remap[i]] == Kind_Locked) + result[i] = Kind_Locked; + } + if (options & meshopt_SimplifyLockBorder) for (size_t i = 0; i < vertex_count; ++i) if (result[i] == Kind_Border) @@ -454,7 +546,7 @@ struct Vector3 float x, y, z; }; -static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL) +static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL, float* out_offset = NULL) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -500,10 +592,17 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat } } + if (out_offset) + { + out_offset[0] = minv[0]; + out_offset[1] = minv[1]; + out_offset[2] = minv[2]; + } + return extent; } -static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* sparse_remap) +static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* attribute_remap, const unsigned int* sparse_remap) { size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float); @@ -513,18 +612,61 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data for (size_t k = 0; k < attribute_count; ++k) { - float a = vertex_attributes_data[ri * vertex_attributes_stride_float + k]; + unsigned int rk = attribute_remap[k]; + float a = vertex_attributes_data[ri * vertex_attributes_stride_float + rk]; - result[i * attribute_count + k] = a * attribute_weights[k]; + result[i * attribute_count + k] = a * attribute_weights[rk]; } } } -static const size_t kMaxAttributes = 16; +static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock) +{ + size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float); + size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float); + + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + unsigned int ri = sparse_remap ? 
sparse_remap[i] : unsigned(i); + + // updating externally locked vertices is not allowed + if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0) + continue; + + // moving locked vertices may result in floating point drift + if (vertex_kind[i] != Kind_Locked) + { + const Vector3& p = vertex_positions[i]; + float* v = vertex_positions_data + ri * vertex_positions_stride_float; + + v[0] = p.x * vertex_scale + vertex_offset[0]; + v[1] = p.y * vertex_scale + vertex_offset[1]; + v[2] = p.z * vertex_scale + vertex_offset[2]; + } + + if (attribute_count) + { + const float* sa = vertex_attributes + i * attribute_count; + float* va = vertex_attributes_data + ri * vertex_attributes_stride_float; + + for (size_t k = 0; k < attribute_count; ++k) + { + unsigned int rk = attribute_remap[k]; + + va[rk] = sa[k] / attribute_weights[rk]; + } + } + } +} + +static const size_t kMaxAttributes = 32; struct Quadric { - // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c + // a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c float a00, a11, a22; float a10, a20, a21; float b0, b1, b2, c; @@ -586,6 +728,14 @@ static void quadricAdd(Quadric& Q, const Quadric& R) Q.w += R.w; } +static void quadricAdd(QuadricGrad& G, const QuadricGrad& R) +{ + G.gx += R.gx; + G.gy += R.gy; + G.gz += R.gz; + G.gw += R.gw; +} + static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count) { for (size_t k = 0; k < attribute_count; ++k) @@ -597,7 +747,7 @@ static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_co } } -static float quadricError(const Quadric& Q, const Vector3& v) +static float quadricEval(const Quadric& Q, const Vector3& v) { float rx = Q.b0; float ry = Q.b1; @@ -620,6 +770,12 @@ static float quadricError(const Quadric& Q, const Vector3& v) r += ry * v.y; r += rz * v.z; + return r; +} + +static float quadricError(const Quadric& Q, const Vector3& v) +{ + float r = quadricEval(Q, v); float s = Q.w == 0.f ? 0.f : 1.f / Q.w; return fabsf(r) * s; @@ -627,26 +783,7 @@ static float quadricError(const Quadric& Q, const Vector3& v) static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va) { - float rx = Q.b0; - float ry = Q.b1; - float rz = Q.b2; - - rx += Q.a10 * v.y; - ry += Q.a21 * v.z; - rz += Q.a20 * v.x; - - rx *= 2; - ry *= 2; - rz *= 2; - - rx += Q.a00 * v.x; - ry += Q.a11 * v.y; - rz += Q.a22 * v.z; - - float r = Q.c; - r += rx * v.x; - r += ry * v.y; - r += rz * v.z; + float r = quadricEval(Q, v); // see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr for (size_t k = 0; k < attribute_count; ++k) @@ -654,14 +791,11 @@ static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribu float a = va[k]; float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw; - r += a * a * Q.w; - r -= 2 * a * g; + r += a * (a * Q.w - 2 * g); } - // TODO: weight normalization is breaking attribute error somehow - float s = 1; // Q.w == 0.f ? 
0.f : 1.f / Q.w; - - return fabsf(r) * s; + // note: unlike position error, we do not normalize by Q.w to retain edge scaling as described in quadricFromAttributes + return fabsf(r); } static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) @@ -684,6 +818,17 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo Q.w = w; } +static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) +{ + Q.a00 = Q.a11 = Q.a22 = w; + Q.a10 = Q.a20 = Q.a21 = 0; + Q.b0 = -x * w; + Q.b1 = -y * w; + Q.b2 = -z * w; + Q.c = (x * x + y * y + z * z) * w; + Q.w = w; +} + static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) { Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; @@ -702,20 +847,24 @@ static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1 static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) { Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; - float length = normalize(p10); - // p20p = length of projection of p2-p0 onto normalize(p1 - p0) + // edge length; keep squared length around for projection correction + float lengthsq = p10.x * p10.x + p10.y * p10.y + p10.z * p10.z; + float length = sqrtf(lengthsq); + + // p20p = length of projection of p2-p0 onto p1-p0; note that p10 is unnormalized so we need to correct it later Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z; - // normal = altitude of triangle from point p2 onto edge p1-p0 - Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p}; - normalize(normal); + // perp = perpendicular vector from p2 to line segment p1-p0 + // note: since p10 is unnormalized we need to correct the projection; we scale p20 instead to take advantage of normalize below + Vector3 perp = {p20.x * lengthsq - p10.x * p20p, p20.y * lengthsq - p10.y * p20p, p20.z * lengthsq - p10.z * p20p}; + normalize(perp); - float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z; + float distance = perp.x * p0.x + perp.y * p0.y + perp.z * p0.z; // note: the weight is scaled linearly with edge length; this has to match the triangle weight - quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight); + quadricFromPlane(Q, perp.x, perp.y, perp.z, -distance, length * weight); } static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count) @@ -728,16 +877,21 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; - // weight is scaled linearly with edge length + // normal = cross(p1 - p0, p2 - p0) Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x}; - float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z); - float w = sqrtf(area); // TODO this needs more experimentation + float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z) * 0.5f; + + // quadric is weighted with the square of edge length (= area) + // this equalizes the units with the positional error (which, after normalization, is a square of distance) + // as a result, a change in weighted attribute 
of 1 along distance d is approximately equivalent to a change in position of d + float w = area; // we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows: // v = (d11 * d20 - d01 * d21) / denom // w = (d00 * d21 - d01 * d20) / denom // u = 1 - v - w // here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj) + // note: v2 and d20/d21 can not be evaluated here as v2 is effectively an unknown variable; we need these only as variables for derivation of gradients const Vector3& v0 = p10; const Vector3& v1 = p20; float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z; @@ -747,7 +901,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, float denomr = denom == 0 ? 0.f : 1.f / denom; // precompute gradient factors - // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes + // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out expressions that are shared between attributes float gx1 = (d11 * v0.x - d01 * v1.x) * denomr; float gx2 = (d00 * v1.x - d01 * v0.x) * denomr; float gy1 = (d11 * v0.y - d01 * v1.y) * denomr; @@ -772,6 +926,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, // quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K // since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields + // note: for simplicity we scale all factors by weight here instead of outside the loop Q.a00 += w * (gx * gx); Q.a11 += w * (gy * gy); Q.a22 += w * (gz * gz); @@ -794,7 +949,112 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, } } -static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) +static void quadricVolumeGradient(QuadricGrad& G, const Vector3& p0, const Vector3& p1, const Vector3& p2) +{ + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + + // normal = cross(p1 - p0, p2 - p0) + Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x}; + float area = normalize(normal) * 0.5f; + + G.gx = normal.x * area; + G.gy = normal.y * area; + G.gz = normal.z * area; + G.gw = (-p0.x * normal.x - p0.y * normal.y - p0.z * normal.z) * area; +} + +static bool quadricSolve(Vector3& p, const Quadric& Q, const QuadricGrad& GV) +{ + // solve A*p = -b where A is the quadric matrix and b is the linear term + float a00 = Q.a00, a11 = Q.a11, a22 = Q.a22; + float a10 = Q.a10, a20 = Q.a20, a21 = Q.a21; + float x0 = -Q.b0, x1 = -Q.b1, x2 = -Q.b2; + + float eps = 1e-6f * Q.w; + + // LDL decomposition: A = LDL^T + float d0 = a00; + float l10 = a10 / d0; + float l20 = a20 / d0; + + float d1 = a11 - a10 * l10; + float dl21 = a21 - a20 * l10; + float l21 = dl21 / d1; + + float d2 = a22 - a20 * l20 - dl21 * l21; + + // solve L*y = x + float y0 = x0; + float y1 = x1 - l10 * y0; + float y2 = x2 - l20 * y0 - l21 * y1; + + // solve D*z = y + float z0 = y0 / d0; + float z1 = y1 / d1; + float z2 = y2 / d2; + + // augment system with linear constraint GV using Lagrange multiplier + float a30 = GV.gx, a31 = GV.gy, a32 = GV.gz; + float 
x3 = -GV.gw; + + float l30 = a30 / d0; + float dl31 = a31 - a30 * l10; + float l31 = dl31 / d1; + float dl32 = a32 - a30 * l20 - dl31 * l21; + float l32 = dl32 / d2; + float d3 = 0.f - a30 * l30 - dl31 * l31 - dl32 * l32; + + float y3 = x3 - l30 * y0 - l31 * y1 - l32 * y2; + float z3 = fabsf(d3) > eps ? y3 / d3 : 0.f; // if d3 is zero, we can ignore the constraint + + // substitute L^T*p = z + float lambda = z3; + float pz = z2 - l32 * lambda; + float py = z1 - l21 * pz - l31 * lambda; + float px = z0 - l10 * py - l20 * pz - l30 * lambda; + + p.x = px; + p.y = py; + p.z = pz; + + return fabsf(d0) > eps && fabsf(d1) > eps && fabsf(d2) > eps; +} + +static void quadricReduceAttributes(Quadric& Q, const Quadric& A, const QuadricGrad* G, size_t attribute_count) +{ + // update vertex quadric with attribute quadric; multiply by vertex weight to minimize normalized error + Q.a00 += A.a00 * Q.w; + Q.a11 += A.a11 * Q.w; + Q.a22 += A.a22 * Q.w; + Q.a10 += A.a10 * Q.w; + Q.a20 += A.a20 * Q.w; + Q.a21 += A.a21 * Q.w; + Q.b0 += A.b0 * Q.w; + Q.b1 += A.b1 * Q.w; + Q.b2 += A.b2 * Q.w; + + float iaw = A.w == 0 ? 0.f : Q.w / A.w; + + // update linear system based on attribute gradients (BB^T/a) + for (size_t k = 0; k < attribute_count; ++k) + { + const QuadricGrad& g = G[k]; + + Q.a00 -= (g.gx * g.gx) * iaw; + Q.a11 -= (g.gy * g.gy) * iaw; + Q.a22 -= (g.gz * g.gz) * iaw; + Q.a10 -= (g.gx * g.gy) * iaw; + Q.a20 -= (g.gx * g.gz) * iaw; + Q.a21 -= (g.gy * g.gz) * iaw; + + Q.b0 -= (g.gx * g.gw) * iaw; + Q.b1 -= (g.gy * g.gw) * iaw; + Q.b2 -= (g.gz * g.gw) * iaw; + } +} + +static void fillFaceQuadrics(Quadric* vertex_quadrics, QuadricGrad* volume_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) { for (size_t i = 0; i < index_count; i += 3) { @@ -808,6 +1068,36 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); quadricAdd(vertex_quadrics[remap[i2]], Q); + + if (volume_gradients) + { + QuadricGrad GV; + quadricVolumeGradient(GV, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2]); + + quadricAdd(volume_gradients[remap[i0]], GV); + quadricAdd(volume_gradients[remap[i1]], GV); + quadricAdd(volume_gradients[remap[i2]], GV); + } + } +} + +static void fillVertexQuadrics(Quadric* vertex_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* remap, unsigned int options) +{ + // by default, we use a very small weight to improve triangulation and numerical stability without affecting the shape or error + float factor = (options & meshopt_SimplifyRegularize) ? 
1e-1f : 1e-7f; + + for (size_t i = 0; i < vertex_count; ++i) + { + if (remap[i] != i) + continue; + + const Vector3& p = vertex_positions[i]; + float w = vertex_quadrics[i].w * factor; + + Quadric Q; + quadricFromPoint(Q, p.x, p.y, p.z, w); + + quadricAdd(vertex_quadrics[i], Q); } } @@ -837,15 +1127,11 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0) continue; - // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges - if (kHasOpposite[k0][k1] && remap[i1] > remap[i0]) - continue; - unsigned int i2 = indices[i + next[e + 1]]; // we try hard to maintain border edge geometry; seam edges can move more freely // due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical - const float kEdgeWeightSeam = 1.f; + const float kEdgeWeightSeam = 0.5f; // applied twice due to opposite edges const float kEdgeWeightBorder = 10.f; float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam; @@ -853,13 +1139,20 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic Quadric Q; quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight); + Quadric QT; + quadricFromTriangle(QT, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight); + + // mix edge quadric with triangle quadric to stabilize collapses in both directions; both quadrics inherit edge weight so that their error is added + QT.w = 0; + quadricAdd(Q, QT); + quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); } } } -static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap) +static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count) { for (size_t i = 0; i < index_count; i += 3) { @@ -871,14 +1164,13 @@ static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attr QuadricGrad G[kMaxAttributes]; quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count); - // TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea - quadricAdd(attribute_quadrics[remap[i0]], QA); - quadricAdd(attribute_quadrics[remap[i1]], QA); - quadricAdd(attribute_quadrics[remap[i2]], QA); + quadricAdd(attribute_quadrics[i0], QA); + quadricAdd(attribute_quadrics[i1], QA); + quadricAdd(attribute_quadrics[i2], QA); - quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count); - quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count); - quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count); + quadricAdd(&attribute_gradients[i0 * attribute_count], G, attribute_count); + quadricAdd(&attribute_gradients[i1 * attribute_count], G, attribute_count); + quadricAdd(&attribute_gradients[i2 * attribute_count], G, attribute_count); } } @@ -922,6 +1214,30 @@ 
static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert continue; // early-out when at least one triangle flips due to a collapse + if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1)) + { +#if TRACE >= 2 + printf("edge block %d -> %d: flip welded %d %d %d\n", i0, i1, a, i0, b); +#endif + + return true; + } + } + + return false; +} + +static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0, const Vector3& v1) +{ + const Vector3& v0 = vertex_positions[i0]; + + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]]; + size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0]; + + for (size_t i = 0; i < count; ++i) + { + unsigned int a = edges[i].next, b = edges[i].prev; + if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1)) return true; } @@ -929,6 +1245,46 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert return false; } +static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0) +{ + const Vector3& v0 = vertex_positions[i0]; + + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]]; + size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0]; + + float result = 0.f; + + for (size_t i = 0; i < count; ++i) + { + unsigned int a = edges[i].next, b = edges[i].prev; + + const Vector3& va = vertex_positions[a]; + const Vector3& vb = vertex_positions[b]; + + float da = (va.x - v0.x) * (va.x - v0.x) + (va.y - v0.y) * (va.y - v0.y) + (va.z - v0.z) * (va.z - v0.z); + float db = (vb.x - v0.x) * (vb.x - v0.x) + (vb.y - v0.y) * (vb.y - v0.y) + (vb.z - v0.z) * (vb.z - v0.z); + + result = result < da ? da : result; + result = result < db ? 
db : result; + } + + return sqrtf(result); +} + +static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback) +{ + unsigned int r = remap[target]; + + // use loop metadata to guide complex collapses towards the correct wedge + // this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams + if (loop[v] != ~0u && remap[loop[v]] == r) + return loop[v]; + else if (loopback[v] != ~0u && remap[loopback[v]] == r) + return loopback[v]; + else + return target; +} + static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind) { size_t dual_count = 0; @@ -947,7 +1303,7 @@ static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_c return (index_count - dual_count / 2) + 3; } -static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop) +static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback) { size_t collapse_count = 0; @@ -983,8 +1339,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c // two vertices are on a border or a seam, but there's no direct edge between them // this indicates that they belong to two different edge loops and we should not collapse this edge - // loop[] tracks half edges so we only need to check i0->i1 - if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1) + // loop[] and loopback[] track half edges so we only need to check one of them + if ((k0 == Kind_Border || k0 == Kind_Seam) && k1 != Kind_Manifold && loop[i0] != i1) + continue; + if ((k1 == Kind_Border || k1 == Kind_Seam) && k0 != Kind_Manifold && loopback[i1] != i0) continue; // edge can be collapsed in either direction - we will pick the one with minimum error @@ -1009,7 +1367,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c return collapse_count; } -static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap) +static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback) { for (size_t i = 0; i < collapse_count; ++i) { @@ -1017,40 +1375,94 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const unsigned int i0 = c.v0; unsigned int i1 = c.v1; - - // most edges are bidirectional which means we need to evaluate errors for two collapses - // to keep this code branchless we just use the same edge for unidirectional edges - unsigned int j0 = c.bidi ? i1 : i0; - unsigned int j1 = c.bidi ? 
i0 : i1;
+		bool bidi = c.bidi;
 
 		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
-		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+		float ej = bidi ? quadricError(vertex_quadrics[remap[i1]], vertex_positions[i0]) : FLT_MAX;
+
+#if TRACE >= 3
+		float di = ei, dj = ej;
+#endif
 
 		if (attribute_count)
 		{
-			ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
-			ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+			ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+			ej += bidi ? quadricError(attribute_quadrics[i1], &attribute_gradients[i1 * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]) : 0;
+
+			// seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: s1 == ~0u should never happen due to the assertion above, but if assertions are disabled and we ever hit this case, we'd get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
+			else
+			{
+				// complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target
+				if (vertex_kind[i0] == Kind_Complex)
+					for (unsigned int v = wedge[i0]; v != i0; v = wedge[v])
+					{
+						unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+						ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+					}
+
+				if (vertex_kind[i1] == Kind_Complex && bidi)
+					for (unsigned int v = wedge[i1]; v != i1; v = wedge[v])
+					{
+						unsigned int t = getComplexTarget(v, i0, remap, loop, loopback);
+
+						ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+					}
+			}
 		}
 
-		// pick edge direction with minimal error
-		c.v0 = ei <= ej ? i0 : j0;
-		c.v1 = ei <= ej ? i1 : j1;
-		c.error = ei <= ej ? ei : ej;
+		// pick edge direction with minimal error (branchless)
+		bool rev = bidi & (ej < ei);
+
+		c.v0 = rev ? i1 : i0;
+		c.v1 = rev ? i0 : i1;
+		c.error = ej < ei ? ej : ei;
+
+#if TRACE >= 3
+		if (bidi)
+			printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n",
+			    rev ? i1 : i0, rev ? i0 : i1,
+			    sqrtf(rev ? ej : ei), sqrtf(rev ? dj : di), sqrtf(rev ? 
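/*
   note (illustration, not part of this change): di/dj capture the purely positional quadric
   error before any attribute terms are accumulated, so this trace can split the total error
   into its two parts by subtraction. schematically, using the names from the surrounding code:

	float total = ei;          // position + attribute error for the i0 -> i1 collapse
	float pos = di;            // positional error only, saved before attribute accumulation
	float attr = total - pos;  // attribute contribution, printed here as sqrtf(ei - di)
*/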
ej - dj : ei - di), + sqrtf(rev ? ei : ej), sqrtf(rev ? di : dj), sqrtf(rev ? ei - di : ej - dj)); + else + printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", i0, i1, sqrtf(c.error), sqrtf(di), sqrtf(ei - di)); +#endif } } static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count) { - const int sort_bits = 11; + // we use counting sort to order collapses by error; since the exact sort order is not as critical, + // only top 12 bits of exponent+mantissa (8 bits of exponent and 4 bits of mantissa) are used. + // to avoid excessive stack usage, we clamp the exponent range as collapses with errors much higher than 1 are not useful. + const unsigned int sort_bits = 12; + const unsigned int sort_bins = 2048 + 512; // exponent range [-127, 32) // fill histogram for counting sort - unsigned int histogram[1 << sort_bits]; + unsigned int histogram[sort_bins]; memset(histogram, 0, sizeof(histogram)); for (size_t i = 0; i < collapse_count; ++i) { // skip sign bit since error is non-negative - unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits); + unsigned int error = collapses[i].errorui; + unsigned int key = (error << 1) >> (32 - sort_bits); + key = key < sort_bins ? key : sort_bins - 1; histogram[key]++; } @@ -1058,7 +1470,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse // compute offsets based on histogram data size_t histogram_sum = 0; - for (size_t i = 0; i < 1 << sort_bits; ++i) + for (size_t i = 0; i < sort_bins; ++i) { size_t count = histogram[i]; histogram[i] = unsigned(histogram_sum); @@ -1071,13 +1483,15 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse for (size_t i = 0; i < collapse_count; ++i) { // skip sign bit since error is non-negative - unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits); + unsigned int error = collapses[i].errorui; + unsigned int key = (error << 1) >> (32 - sort_bits); + key = key < sort_bins ? 
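/*
   note (illustration, not part of this change): for non-negative IEEE-754 floats, the raw bit
   pattern increases monotonically with the value, so a truncated bit prefix is a valid
   counting-sort key; errorui is the collapse error viewed as raw bits (Collapse keeps error
   and errorui in a union earlier in this file). a minimal sketch of the key extraction:

	float error = 0.25f;
	unsigned int bits;
	memcpy(&bits, &error, sizeof(bits));          // reinterpret the float as raw bits
	unsigned int key = (bits << 1) >> (32 - 12);  // drop the sign bit, keep top 8+4 bits
	key = key < sort_bins ? key : sort_bins - 1;  // clamp very large errors into the last bin
*/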
key : sort_bins - 1;
 
 		sort_order[histogram[key]++] = unsigned(i);
 	}
 }
 
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
 {
 	size_t edge_collapses = 0;
 	size_t triangle_collapses = 0;
@@ -1087,7 +1501,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 	size_t edge_collapse_goal = triangle_collapse_goal / 2;
 
 #if TRACE
-	size_t stats[4] = {};
+	size_t stats[7] = {};
 #endif
 
 	for (size_t i = 0; i < collapse_count; ++i)
@@ -1097,10 +1511,16 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		TRACESTATS(0);
 
 		if (c.error > error_limit)
+		{
+			TRACESTATS(4);
 			break;
+		}
 
 		if (triangle_collapses >= triangle_collapse_goal)
+		{
+			TRACESTATS(5);
 			break;
+		}
 
 		// we limit the error in each pass based on the error of the optimal last collapse; since many collapses will be locked
 		// as they will share vertices with other successful collapses, we need to increase the acceptable error by some factor
 		// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
 		// topology, we only abort once we have performed over 1/6 of the collapse goal. 
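/*
   note (illustration, not part of this change): error_goal referenced below is derived from
   the collapse that sits at the per-pass edge budget, scaled by 1.5x to tolerate collapses
   that get locked mid-pass; the TRACE block at the end of this function recomputes the same
   value. a sketch consistent with that code:

	float error_goal = edge_collapse_goal < collapse_count
	    ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error
	    : FLT_MAX;
*/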
-		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 6)
+		if (c.error > error_goal && c.error > result_error && triangle_collapses > triangle_collapse_goal / 6)
+		{
+			TRACESTATS(6);
 			break;
+		}
 
 		unsigned int i0 = c.v0;
 		unsigned int i1 = c.v1;
@@ -1117,6 +1540,8 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		unsigned int r0 = remap[i0];
 		unsigned int r1 = remap[i1];
 
+		unsigned char kind = vertex_kind[i0];
+
 		// we don't collapse vertices that had source or target vertex involved in a collapse
 		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
 		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
@@ -1135,35 +1560,41 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 			continue;
 		}
 
+#if TRACE >= 2
+		printf("edge commit %d -> %d: kind %d->%d, error %f\n", i0, i1, vertex_kind[i0], vertex_kind[i1], sqrtf(c.error));
+#endif
+
 		assert(collapse_remap[r0] == r0);
 		assert(collapse_remap[r1] == r1);
 
-		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
-
-		if (attribute_count)
-		{
-			quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
-			quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
-		}
-
-		if (vertex_kind[i0] == Kind_Complex)
+		if (kind == Kind_Complex)
 		{
+			// remap all vertices in the complex to the target vertex
 			unsigned int v = i0;
 
 			do
 			{
-				collapse_remap[v] = r1;
+				unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+				collapse_remap[v] = t;
 				v = wedge[v];
 			} while (v != i0);
 		}
-		else if (vertex_kind[i0] == Kind_Seam)
+		else if (kind == Kind_Seam)
 		{
-			// remap v0 to v1 and seam pair of v0 to seam pair of v1
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
-			unsigned int s1 = wedge[i1];
+			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+			assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+			assert(s1 != ~0u && remap[s1] == r1);
 
-			assert(s0 != i0 && s1 != i1);
-			assert(wedge[s0] == i0 && wedge[s1] == i1);
+			// additional asserts to verify that the seam pair is consistent
+			assert(kind != vertex_kind[i1] || s1 == wedge[i1]);
+			assert(loop[i0] == i1 || loopback[i0] == i1);
+			assert(loop[s0] == s1 || loopback[s0] == s1);
+
+			// note: s1 == ~0u should never happen due to the assertion above, but if assertions are disabled and we ever hit this case, we'd get a memory safety issue; for now play it safe
+			s1 = (s1 != ~0u) ? s1 : wedge[i1];
 
 			collapse_remap[i0] = i1;
 			collapse_remap[s0] = s1;
@@ -1175,28 +1606,205 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 			collapse_remap[i0] = i1;
 		}
 
+		// note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used
+		// however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling
 		collapse_locked[r0] = 1;
 		collapse_locked[r1] = 1;
 
 		// border edges collapse 1 triangle, other edges collapse 2 or more
-		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+		triangle_collapses += (kind == Kind_Border) ? 1 : 2;
 		edge_collapses++;
 
 		result_error = result_error < c.error ? 
c.error : result_error; } #if TRACE - float error_goal_perfect = edge_collapse_goal < collapse_count ? collapses[collapse_order[edge_collapse_goal]].error : 0.f; + float error_goal_last = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX; + float error_goal_limit = error_goal_last < error_limit ? error_goal_last : error_limit; - printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d)\n", - int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_perfect), - int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2])); + printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d); %s\n", + int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_limit), + int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]), + stats[4] ? "error limit" : (stats[5] ? "count limit" : (stats[6] ? "error goal" : "out of collapses"))); #endif return edge_collapses; } -static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap) +static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, QuadricGrad* volume_gradients, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error) +{ + for (size_t i = 0; i < vertex_count; ++i) + { + if (collapse_remap[i] == i) + continue; + + unsigned int i0 = unsigned(i); + unsigned int i1 = collapse_remap[i]; + + unsigned int r0 = remap[i0]; + unsigned int r1 = remap[i1]; + + // ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved + if (i0 == r0) + { + quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); + + if (volume_gradients) + quadricAdd(volume_gradients[r1], volume_gradients[r0]); + } + + if (attribute_count) + { + quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]); + quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count); + + if (i0 == r0) + { + // when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment + float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]); + vertex_error = vertex_error < derr ? 
derr : vertex_error; + } + } + } +} + +static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update) +{ +#if TRACE + size_t stats[6] = {}; +#endif + + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + // moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds + // moving vertices on a border requires a stronger edge quadric to preserve the border geometry + if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border) + continue; + + if (remap[i] != i) + { + vertex_positions[i] = vertex_positions[remap[i]]; + continue; + } + + TRACESTATS(0); + + const Vector3& vp = vertex_positions[i]; + + Quadric Q = vertex_quadrics[i]; + QuadricGrad GV = {}; + + // add a point quadric for regularization to stabilize the solution + Quadric R; + quadricFromPoint(R, vp.x, vp.y, vp.z, Q.w * 1e-4f); + quadricAdd(Q, R); + + if (attribute_count) + { + // optimal point simultaneously minimizes attribute quadrics for all wedges + unsigned int v = unsigned(i); + do + { + quadricReduceAttributes(Q, attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count); + v = wedge[v]; + } while (v != i); + + // minimizing attribute quadrics results in volume loss so we incorporate volume gradient as a constraint + if (volume_gradients) + GV = volume_gradients[i]; + } + + Vector3 p; + if (!quadricSolve(p, Q, GV)) + { + TRACESTATS(2); + continue; + } + + // reject updates that move the vertex too far from its neighborhood + // this detects and fixes most cases when the quadric is not well-defined + float nr = getNeighborhoodRadius(adjacency, vertex_positions, unsigned(i)); + float dp = (p.x - vp.x) * (p.x - vp.x) + (p.y - vp.y) * (p.y - vp.y) + (p.z - vp.z) * (p.z - vp.z); + + if (dp > nr * nr) + { + TRACESTATS(3); + continue; + } + + // reject updates that would flip a neighboring triangle, as we do for edge collapse + if (hasTriangleFlips(adjacency, vertex_positions, unsigned(i), p)) + { + TRACESTATS(4); + continue; + } + + // reject updates that increase positional error too much; allow some tolerance to improve attribute quality + if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f) + { + TRACESTATS(5); + continue; + } + + TRACESTATS(1); + vertex_positions[i] = p; + } + +#if TRACE + printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5])); +#endif +} + +static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update) +{ + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + if (remap[i] != i) + continue; + + for (size_t k = 0; k < attribute_count; ++k) + { + unsigned int shared = ~0u; + + // for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared + if 
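/*
   note (illustration, not part of this change): the branch that resumes after this comment
   scans every wedge of a complex vertex; if all wedges carry the same attribute value,
   'shared' ends up as the wedge with the largest quadric weight and that single solve is
   reused for all of them, preserving attribute continuity. the wedge walk pattern used
   throughout this file looks like:

	for (unsigned int v = wedge[i]; v != unsigned(i); v = wedge[v])
	    ; // visits every other vertex that shares a position with vertex i
*/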
(vertex_kind[i] == Kind_Complex) + { + shared = unsigned(i); + + for (unsigned int v = wedge[i]; v != i; v = wedge[v]) + if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k]) + shared = ~0u; + else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w) + shared = v; + } + + // update attributes for all wedges + unsigned int v = unsigned(i); + do + { + unsigned int r = (shared == ~0u) ? v : shared; + + const Vector3& p = vertex_positions[i]; // same for all wedges + const Quadric& A = attribute_quadrics[r]; + const QuadricGrad& G = attribute_gradients[r * attribute_count + k]; + + float iw = A.w == 0 ? 0.f : 1.f / A.w; + float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw; + + vertex_attributes[v * attribute_count + k] = av; + v = wedge[v]; + } while (v != i); + } + } +} + +static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap, const unsigned int* remap) { size_t write = 0; @@ -1211,7 +1819,14 @@ static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const assert(collapse_remap[v1] == v1); assert(collapse_remap[v2] == v2); - if (v0 != v1 && v0 != v2 && v1 != v2) + // collapse zero area triangles even if they are not topologically degenerate + // this is required to cleanup manifold->seam collapses when a vertex is collapsed onto a seam pair + // as well as complex collapses and some other cases where cross wedge collapses are performed + unsigned int r0 = remap[v0]; + unsigned int r1 = remap[v1]; + unsigned int r2 = remap[v2]; + + if (r0 != r1 && r0 != r2 && r1 != r2) { indices[write + 0] = v0; indices[write + 1] = v1; @@ -1227,17 +1842,183 @@ static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsign { for (size_t i = 0; i < vertex_count; ++i) { + // note: this is a no-op for vertices that were remapped + // ideally we would clear the loop entries for those for consistency, even though they aren't going to be used + // however, the remapping process needs loop information for remapped vertices, so this would require a separate pass if (loop[i] != ~0u) { unsigned int l = loop[i]; unsigned int r = collapse_remap[l]; // i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes - loop[i] = (i == r) ? loop[l] : r; + if (i == r) + loop[i] = (loop[l] != ~0u) ? collapse_remap[loop[l]] : ~0u; + else + loop[i] = r; } } } +static unsigned int follow(unsigned int* parents, unsigned int index) +{ + while (index != parents[index]) + { + unsigned int parent = parents[index]; + parents[index] = parents[parent]; + index = parent; + } + + return index; +} + +static size_t buildComponents(unsigned int* components, size_t vertex_count, const unsigned int* indices, size_t index_count, const unsigned int* remap) +{ + for (size_t i = 0; i < vertex_count; ++i) + components[i] = unsigned(i); + + // compute a unique (but not sequential!) 
index for each component via union-find + for (size_t i = 0; i < index_count; i += 3) + { + static const int next[4] = {1, 2, 0, 1}; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + + unsigned int r0 = remap[i0]; + unsigned int r1 = remap[i1]; + + r0 = follow(components, r0); + r1 = follow(components, r1); + + // merge components with larger indices into components with smaller indices + // this guarantees that the root of the component is always the one with the smallest index + if (r0 != r1) + components[r0 < r1 ? r1 : r0] = r0 < r1 ? r0 : r1; + } + } + + // make sure each element points to the component root *before* we renumber the components + for (size_t i = 0; i < vertex_count; ++i) + if (remap[i] == i) + components[i] = follow(components, unsigned(i)); + + unsigned int next_component = 0; + + // renumber components using sequential indices + // a sequential pass is sufficient because component root always has the smallest index + // note: it is unsafe to use follow() in this pass because we're replacing component links with sequential indices inplace + for (size_t i = 0; i < vertex_count; ++i) + { + if (remap[i] == i) + { + unsigned int root = components[i]; + assert(root <= i); // make sure we already computed the component for non-roots + components[i] = (root == i) ? next_component++ : components[root]; + } + else + { + assert(remap[i] < i); // make sure we already computed the component + components[i] = components[remap[i]]; + } + } + + return next_component; +} + +static void measureComponents(float* component_errors, size_t component_count, const unsigned int* components, const Vector3* vertex_positions, size_t vertex_count) +{ + memset(component_errors, 0, component_count * 4 * sizeof(float)); + + // compute approximate sphere center for each component as an average + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int c = components[i]; + assert(components[i] < component_count); + + Vector3 v = vertex_positions[i]; // copy avoids aliasing issues + + component_errors[c * 4 + 0] += v.x; + component_errors[c * 4 + 1] += v.y; + component_errors[c * 4 + 2] += v.z; + component_errors[c * 4 + 3] += 1; // weight + } + + // complete the center computation, and reinitialize [3] as a radius + for (size_t i = 0; i < component_count; ++i) + { + float w = component_errors[i * 4 + 3]; + float iw = w == 0.f ? 0.f : 1.f / w; + + component_errors[i * 4 + 0] *= iw; + component_errors[i * 4 + 1] *= iw; + component_errors[i * 4 + 2] *= iw; + component_errors[i * 4 + 3] = 0; // radius + } + + // compute squared radius for each component + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int c = components[i]; + + float dx = vertex_positions[i].x - component_errors[c * 4 + 0]; + float dy = vertex_positions[i].y - component_errors[c * 4 + 1]; + float dz = vertex_positions[i].z - component_errors[c * 4 + 2]; + float r = dx * dx + dy * dy + dz * dz; + + component_errors[c * 4 + 3] = component_errors[c * 4 + 3] < r ? 
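/*
   note (illustration, not part of this change): a component's error is the squared radius of
   an approximate bounding sphere: the loops above average member positions into a center, and
   this running maximum accumulates the largest squared distance to it. a compact sketch of
   the same two-phase computation for one point set p[0..n), assuming n > 0:

	float cx = 0.f, cy = 0.f, cz = 0.f;
	for (size_t j = 0; j < n; ++j) { cx += p[j].x; cy += p[j].y; cz += p[j].z; }
	cx /= float(n); cy /= float(n); cz /= float(n); // approximate sphere center
	float r2 = 0.f;
	for (size_t j = 0; j < n; ++j)
	{
	    float dx = p[j].x - cx, dy = p[j].y - cy, dz = p[j].z - cz;
	    float d2 = dx * dx + dy * dy + dz * dz;
	    r2 = r2 < d2 ? d2 : r2; // kept squared to match quadric error units
	}
*/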
r : component_errors[c * 4 + 3]; + } + + // we've used the output buffer as scratch space, so we need to move the results to proper indices + for (size_t i = 0; i < component_count; ++i) + { +#if TRACE >= 2 + printf("component %d: center %f %f %f, error %e\n", int(i), + component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3])); +#endif + // note: we keep the squared error to make it match quadric error metric + component_errors[i] = component_errors[i * 4 + 3]; + } +} + +static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror) +{ + (void)component_count; + + size_t write = 0; + float min_error = FLT_MAX; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int v0 = indices[i + 0], v1 = indices[i + 1], v2 = indices[i + 2]; + unsigned int c = components[v0]; + assert(c == components[v1] && c == components[v2]); + + if (component_errors[c] > error_cutoff) + { + min_error = min_error > component_errors[c] ? component_errors[c] : min_error; + + indices[write + 0] = v0; + indices[write + 1] = v1; + indices[write + 2] = v2; + write += 3; + } + } + +#if TRACE + size_t pruned_components = 0; + for (size_t i = 0; i < component_count; ++i) + pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff); + + printf("pruned %d triangles in %d components (goal %e); next %e\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff), min_error < FLT_MAX ? sqrtf(min_error) : min_error * 2); +#endif + + // update next error with the smallest error of the remaining components + nexterror = min_error; + return write; +} + struct CellHasher { const unsigned int* vertex_ids; @@ -1299,7 +2080,7 @@ struct TriangleHasher } }; -static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size) +static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, const unsigned char* vertex_lock, size_t vertex_count, int grid_size) { assert(grid_size >= 1 && grid_size <= 1024); float cell_scale = float(grid_size - 1); @@ -1312,7 +2093,10 @@ static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_pos int yi = int(v.y * cell_scale + 0.5f); int zi = int(v.z * cell_scale + 0.5f); - vertex_ids[i] = (xi << 20) | (yi << 10) | zi; + if (vertex_lock && (vertex_lock[i] & meshopt_SimplifyVertex_Lock)) + vertex_ids[i] = (1 << 30) | unsigned(i); + else + vertex_ids[i] = (xi << 20) | (yi << 10) | zi; } } @@ -1541,17 +2325,17 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float // three point interpolation from "revenge of interpolation search" paper float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0); float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2); - return x1 + num / den; + return x1 + (den == 0.f ? 
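/*
   note (illustration, not part of this change): the den == 0.f guard on this return makes the
   three-point interpolation fall back to x1, the current midpoint estimate, instead of
   dividing by zero on degenerate samples. worked example: if y0 == y1 == y2, both products in
   den contain a zero factor (y0 - y1 and y1 - y2), so den == 0 and the search keeps its
   current guess rather than producing an infinity or NaN.
*/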
0.f : num / den);
 	}
 }
 
 } // namespace meshopt
 
-#ifndef NDEBUG
-// Note: this is only exposed for debug visualization purposes; do *not* use these in debug builds
-MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = NULL;
-#endif
+// Note: this is only exposed for development purposes; do *not* use
+enum
+{
+	meshopt_SimplifyInternalSolve = 1 << 29,
+	meshopt_SimplifyInternalDebug = 1 << 30
+};
 
 size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
@@ -1561,10 +2345,13 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
-	assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute)) == 0);
+	assert(target_error >= 0);
+	assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyRegularize | meshopt_SimplifyPermissive | meshopt_SimplifyInternalSolve | meshopt_SimplifyInternalDebug)) == 0);
 	assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
 	assert(vertex_attributes_stride % sizeof(float) == 0);
 	assert(attribute_count <= kMaxAttributes);
+	for (size_t i = 0; i < attribute_count; ++i)
+		assert(attribute_weights[i] >= 0);
 
 	meshopt_Allocator allocator;
 
@@ -1584,6 +2371,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL);
 
 	// build position remap that maps each vertex to the one with identical position
+	// wedge table stores next vertex with identical position for each vertex
 	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
 	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
 	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator);
@@ -1610,14 +2398,23 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 #endif
 
 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
-	float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap);
+	float vertex_offset[3] = {};
+	float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, vertex_offset);
 
 	float* vertex_attributes = NULL;
+	unsigned int attribute_remap[kMaxAttributes];
 
 	if (attribute_count)
 	{
+		// remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics
+		size_t attributes_used = 0;
+		for (size_t i = 0; i < attribute_count; ++i)
+			if (attribute_weights[i] > 0)
+				attribute_remap[attributes_used++] = unsigned(i);
+
+		attribute_count = attributes_used;
 		vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
-		
rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, sparse_remap);
+		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, attribute_remap, sparse_remap);
 	}
 
 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
@@ -1625,6 +2422,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 	Quadric* attribute_quadrics = NULL;
 	QuadricGrad* attribute_gradients = NULL;
+	QuadricGrad* volume_gradients = NULL;
 
 	if (attribute_count)
 	{
@@ -1633,13 +2431,42 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 		attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
 		memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+
+		if (options & meshopt_SimplifyInternalSolve)
+		{
+			volume_gradients = allocator.allocate<QuadricGrad>(vertex_count);
+			memset(volume_gradients, 0, vertex_count * sizeof(QuadricGrad));
+		}
 	}
 
-	fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap);
+	fillFaceQuadrics(vertex_quadrics, volume_gradients, result, index_count, vertex_positions, remap);
+	fillVertexQuadrics(vertex_quadrics, vertex_positions, vertex_count, remap, options);
 	fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
 
 	if (attribute_count)
-		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
+		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count);
+
+	unsigned int* components = NULL;
+	float* component_errors = NULL;
+	size_t component_count = 0;
+	float component_nexterror = 0;
+
+	if (options & meshopt_SimplifyPrune)
+	{
+		components = allocator.allocate<unsigned int>(vertex_count);
+		component_count = buildComponents(components, vertex_count, result, index_count, remap);
+
+		component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+		measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+		component_nexterror = FLT_MAX;
+		for (size_t i = 0; i < component_count; ++i)
+			component_nexterror = component_nexterror > component_errors[i] ? component_errors[i] : component_nexterror;
+
+#if TRACE
+		printf("components: %d (min error %e)\n", int(component_count), sqrtf(component_nexterror));
+#endif
+	}
 
 #if TRACE
 	size_t pass_count = 0;
@@ -1654,6 +2481,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 	size_t result_count = index_count;
 	float result_error = 0;
+	float vertex_error = 0;
 
 	// target_error input is linear; we need to adjust it to match quadricError units
 	float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? 
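/*
   note (illustration, not part of this change): quadric errors are evaluated on positions
   rescaled into a unit cube and compared in squared units, so the caller's linear target_error
   has to be normalized and then squared before it can act as a limit. a sketch of the
   conversion, assuming vertex_scale holds the mesh extent returned by rescalePositions:

	float error_limit = (target_error / error_scale) * (target_error / error_scale);
*/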
vertex_scale : 1.f; @@ -1664,14 +2492,18 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic // note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap); - size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop); + size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop, loopback); assert(edge_collapse_count <= collapse_capacity); // no edges can be collapsed any more due to topology restrictions if (edge_collapse_count == 0) break; - rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap); +#if TRACE + printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' '); +#endif + + rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback); sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count); @@ -1682,39 +2514,101 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic memset(collapse_locked, 0, vertex_count); -#if TRACE - printf("pass %d: ", int(pass_count++)); -#endif - - size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error); + size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error); // no edges can be collapsed any more due to hitting the error limit or triangle collapse limit if (collapses == 0) break; + updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error); + + // updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent + vertex_error = attribute_count == 0 ? 
result_error : vertex_error; + + // note: we update loops following edge collapses, but after this we might still have stale loop data + // this can happen when a triangle with a loop edge gets collapsed along a non-loop edge + // that works since a loop that points to a vertex that is no longer connected is not affecting collapse logic remapEdgeLoops(loop, vertex_count, collapse_remap); remapEdgeLoops(loopback, vertex_count, collapse_remap); - size_t new_count = remapIndexBuffer(result, result_count, collapse_remap); - assert(new_count < result_count); + result_count = remapIndexBuffer(result, result_count, collapse_remap, remap); + if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error) + result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror); + } + + // at this point, component_nexterror might be stale: component it references may have been removed through a series of edge collapses + bool component_nextstale = true; + + // we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit + while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit) + { +#if TRACE + printf("pass %d: cleanup; ", int(pass_count++)); +#endif + + float component_cutoff = component_nexterror * 1.5f < error_limit ? component_nexterror * 1.5f : error_limit; + + // track maximum error in eligible components as we are increasing resulting error + float component_maxerror = 0; + for (size_t i = 0; i < component_count; ++i) + if (component_errors[i] > component_maxerror && component_errors[i] <= component_cutoff) + component_maxerror = component_errors[i]; + + size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror); + if (new_count == result_count && !component_nextstale) + break; + + component_nextstale = false; // pruneComponents guarantees next error is up to date result_count = new_count; + result_error = result_error < component_maxerror ? component_maxerror : result_error; + vertex_error = vertex_error < component_maxerror ? 
component_maxerror : vertex_error;
 	}
 
 #if TRACE
-	printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count));
+	printf("result: %d triangles, error: %e (pos %.3e); total %d passes\n", int(result_count / 3), sqrtf(result_error), sqrtf(vertex_error), int(pass_count));
 #endif
 
-#ifndef NDEBUG
-	if (meshopt_simplifyDebugKind)
-		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+	// if solve is requested, update input buffers destructively from internal data
+	if (options & meshopt_SimplifyInternalSolve)
+	{
+		unsigned char* vertex_update = collapse_locked; // reuse as scratch space
+		memset(vertex_update, 0, vertex_count);
 
-	if (meshopt_simplifyDebugLoop)
-		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+		// limit quadric solve to vertices that are still used in the result
+		for (size_t i = 0; i < result_count; ++i)
+		{
+			unsigned int v = result[i];
 
-	if (meshopt_simplifyDebugLoopBack)
-		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
-#endif
+			// mark the vertex for finalizeVertices and root vertex for solve*
+			vertex_update[remap[v]] = vertex_update[v] = 1;
+		}
+
+		// edge adjacency may be stale as we haven't updated it after last series of edge collapses
+		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+		solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+
+		if (attribute_count)
+			solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
+	}
+
+	// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
+	if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap)
+	{
+		assert(Kind_Count <= 8 && vertex_count < (1 << 28)); // 3 bit kind, 1 bit loop
+
+		for (size_t i = 0; i < result_count; i += 3)
+		{
+			unsigned int a = result[i + 0], b = result[i + 1], c = result[i + 2];
+
+			result[i + 0] |= (vertex_kind[a] << 28) | (unsigned(loop[a] == b || loopback[b] == a) << 31);
+			result[i + 1] |= (vertex_kind[b] << 28) | (unsigned(loop[b] == c || loopback[c] == b) << 31);
+			result[i + 2] |= (vertex_kind[c] << 28) | (unsigned(loop[c] == a || loopback[a] == c) << 31);
+		}
+	}
 
 	// convert resulting indices back into the dense space of the larger mesh
 	if (sparse_remap)
@@ -1730,15 +2624,24 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
+	assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead
+
 	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, 
target_index_count, target_error, options, out_result_error); } size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error) { + assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead + return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error); } -size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error) +size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error) +{ + return meshopt_simplifyEdge(indices, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options | meshopt_SimplifyInternalSolve, out_result_error); +} + +size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* out_result_error) { using namespace meshopt; @@ -1766,15 +2669,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind const int kInterpolationPasses = 5; // invariant: # of triangles in min_grid <= target_count - int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error)); + int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : (target_error < 1.f ? target_error : 1.f))); int max_grid = 1025; size_t min_triangles = 0; size_t max_triangles = index_count / 3; // when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid - if (min_grid > 1) + if (min_grid > 1 || vertex_lock) { - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid); min_triangles = countTriangles(vertex_ids, indices, index_count); } @@ -1790,7 +2693,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind int grid_size = next_grid_size; grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? 
max_grid - 1 : grid_size); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, grid_size); size_t triangles = countTriangles(vertex_ids, indices, index_count); #if TRACE @@ -1800,7 +2703,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind (triangles <= target_index_count / 3) ? "under" : "over"); #endif - float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles)); + float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles)); if (triangles <= target_index_count / 3) { @@ -1832,7 +2735,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind unsigned int* vertex_cells = allocator.allocate(vertex_count); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid); size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); // build a quadric for each target cell @@ -1853,15 +2756,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind for (size_t i = 0; i < cell_count; ++i) result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error; - // collapse triangles! - // note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :( + // vertex collapses often result in duplicate triangles; we need a table to filter them out size_t tritable_size = hashBuckets2(min_triangles); unsigned int* tritable = allocator.allocate(tritable_size); + // note: this is the first and last write to destination, which allows aliasing destination with indices size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap); #if TRACE - printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error)); + printf("result: grid size %d, %d cells, %d triangles (%d unfiltered), error %e\n", min_grid, int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error)); #endif if (out_result_error) @@ -1870,6 +2773,40 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind return write; } +size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float target_error) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + assert(target_error >= 0); + + meshopt_Allocator allocator; + + unsigned int* result = destination; + if (result != indices) + memcpy(result, indices, index_count * sizeof(unsigned int)); + + // build position remap that maps each vertex to the one with identical position + unsigned int* remap = allocator.allocate(vertex_count); + buildPositionRemap(remap, NULL, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, allocator); + + Vector3* vertex_positions = allocator.allocate(vertex_count); + 
rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, NULL); + + unsigned int* components = allocator.allocate(vertex_count); + size_t component_count = buildComponents(components, vertex_count, indices, index_count, remap); + + float* component_errors = allocator.allocate(component_count * 4); // overallocate for temporary use inside measureComponents + measureComponents(component_errors, component_count, components, vertex_positions, vertex_count); + + float component_nexterror = 0; + size_t result_count = pruneComponents(result, index_count, components, component_errors, component_count, target_error * target_error, component_nexterror); + + return result_count; +} + size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count) { using namespace meshopt; @@ -1922,7 +2859,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos int grid_size = next_grid_size; grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size); + computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, grid_size); size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count); #if TRACE @@ -1959,7 +2896,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos // build vertex->cell association by mapping all vertices with the same quantized position to the same cell unsigned int* vertex_cells = allocator.allocate(vertex_count); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, min_grid); size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); // accumulate points into a reservoir for each target cell @@ -1972,7 +2909,10 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos unsigned int* cell_remap = allocator.allocate(cell_count); float* cell_errors = allocator.allocate(cell_count); - fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight * color_weight, vertex_count); + // we scale the color weight to bring it to the same scale as position so that error addition makes sense + float color_weight_scaled = color_weight * (min_grid == 1 ? 
1.f : 1.f / (min_grid - 1)); + + fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight_scaled * color_weight_scaled, vertex_count); // copy results to the output assert(cell_count <= target_vertex_count); diff --git a/Source/ThirdParty/meshoptimizer/spatialorder.cpp b/Source/ThirdParty/meshoptimizer/spatialorder.cpp index 7b1a06945..8a785fcd5 100644 --- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp +++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp @@ -10,18 +10,19 @@ namespace meshopt { -// "Insert" two 0 bits after each of the 10 low bits of x -inline unsigned int part1By2(unsigned int x) +// "Insert" two 0 bits after each of the 20 low bits of x +inline unsigned long long part1By2(unsigned long long x) { - x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210 - x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210 - x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210 - x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10 - x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 + x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210 + x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210 + x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210 + x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210 + x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10 + x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 return x; } -static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); - float scale = extent == 0 ? 0.f : 1.f / extent; + // rescale each axis to 16 bits to get 48-bit Morton codes + float scale = extent == 0 ? 
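/*
   note (illustration, not part of this change): with each axis quantized to 16 bits,
   interleaving through part1By2 yields a 48-bit Morton code whose numeric order follows a
   Z-order space-filling curve, so nearby points tend to receive nearby codes. a small sanity
   sketch of the interleave (bit k of x lands at code bit 3k, y at 3k+1, z at 3k+2):

	unsigned long long code = part1By2(1) | (part1By2(2) << 1) | (part1By2(4) << 2);
	// x bit 0 -> code bit 0, y bit 1 -> code bit 4, z bit 2 -> code bit 8: code == 0x111
*/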
0.f : 65535.f / extent; // generate Morton order based on the position inside a unit cube for (size_t i = 0; i < vertex_count; ++i) { const float* v = vertex_positions_data + i * vertex_stride_float; - int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f); - int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f); - int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f); + int x = int((v[0] - minv[0]) * scale + 0.5f); + int y = int((v[1] - minv[1]) * scale + 0.5f); + int z = int((v[2] - minv[2]) * scale + 0.5f); - result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); + if (morton) + result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); + else + result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40); } } -static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count) +static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count) { + unsigned int hist[1024]; memset(hist, 0, sizeof(hist)); - // compute 3 10-bit histograms in parallel + // compute histogram (assume keys are 10-bit) for (size_t i = 0; i < count; ++i) - { - unsigned int id = data[i]; + hist[keys[i]]++; - hist[(id >> 0) & 1023][0]++; - hist[(id >> 10) & 1023][1]++; - hist[(id >> 20) & 1023][2]++; - } - - unsigned int sumx = 0, sumy = 0, sumz = 0; + unsigned int sum = 0; // replace histogram data with prefix histogram sums in-place for (int i = 0; i < 1024; ++i) { - unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; - - hist[i][0] = sumx; - hist[i][1] = sumy; - hist[i][2] = sumz; - - sumx += hx; - sumy += hy; - sumz += hz; + unsigned int h = hist[i]; + hist[i] = sum; + sum += h; } - assert(sumx == count && sumy == count && sumz == count); + assert(sum == count); + + // reorder values + for (size_t i = 0; i < count; ++i) + { + unsigned int id = keys[source[i]]; + + destination[hist[id]++] = source[i]; + } } -static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass) +static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count) { - int bitoff = pass * 10; + memset(hist, 0, sizeof(hist)); + + // compute 2 8-bit histograms in parallel + for (size_t i = 0; i < count; ++i) + { + unsigned long long id = data[i]; + + hist[(id >> 0) & 255][0]++; + hist[(id >> 8) & 255][1]++; + } + + unsigned int sum0 = 0, sum1 = 0; + + // replace histogram data with prefix histogram sums in-place + for (int i = 0; i < 256; ++i) + { + unsigned int h0 = hist[i][0], h1 = hist[i][1]; + + hist[i][0] = sum0; + hist[i][1] = sum1; + + sum0 += h0; + sum1 += h1; + } + + assert(sum0 == count && sum1 == count); +} + +static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass) +{ + int bitoff = pass * 8; for (size_t i = 0; i < count; ++i) { - unsigned int id = (keys[source[i]] >> bitoff) & 1023; + unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255; destination[hist[id][pass]++] = source[i]; } } +static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) +{ + size_t l = 0, r = split; + + for (size_t i = 0; i < count; ++i) + { + unsigned char side = sides[order[i]]; + target[side ? 
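/*
   note (illustration, not part of this change): this is a stable two-way partition; elements
   keep their relative order on each side, and the cursor update that follows is branchless
   (l += 1; l -= side is equivalent to l += !side). sketch of the bookkeeping for one element:

	unsigned char side = 1; // this element belongs to the right half
	size_t l = 0, r = split;
	// target[side ? r : l] receives the element, then:
	l += 1; l -= side;      // l advances only when side == 0
	r += side;              // r advances only when side == 1
*/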
r : l] = order[i]; + l += 1; + l -= side; + r += side; + } + + assert(l == split && r == count); +} + +static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size) +{ + if (count <= cluster_size) + { + memcpy(destination, orderx, count * sizeof(unsigned int)); + return; + } + + unsigned int* axes[3] = {orderx, ordery, orderz}; + + int bestk = -1; + unsigned int bestdim = 0; + + for (int k = 0; k < 3; ++k) + { + const unsigned int mask = (1 << 20) - 1; + unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask); + + if (dim >= bestdim) + { + bestk = k; + bestdim = dim; + } + } + + assert(bestk >= 0); + + // split roughly in half, with the left split always being aligned to cluster size + size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size; + assert(split > 0 && split < count); + + // mark sides of split for partitioning + unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int); + + for (size_t i = 0; i < split; ++i) + sides[axes[bestk][i]] = 0; + + for (size_t i = split; i < count; ++i) + sides[axes[bestk][i]] = 1; + + // partition all axes into two sides, maintaining order + unsigned int* temp = static_cast<unsigned int*>(scratch); + + for (int k = 0; k < 3; ++k) + { + if (k == bestk) + continue; + + unsigned int* axis = axes[k]; + memcpy(temp, axis, sizeof(unsigned int) * count); + partitionPoints(axis, temp, sides, split, count); + } + + // recursion depth is logarithmic and bounded as we always split in approximately half + splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size); + splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size); +} + } // namespace meshopt void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) @@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos meshopt_Allocator allocator; - unsigned int* keys = allocator.allocate<unsigned int>(vertex_count); - computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride); + unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count); + computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true); - unsigned int hist[1024][3]; - computeHistogram(hist, keys, vertex_count); - - unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count); + unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys + unsigned short* keyk = (unsigned short*)(scratch + vertex_count); for (size_t i = 0; i < vertex_count; ++i) destination[i] = unsigned(i); - // 3-pass radix sort computes the resulting order into scratch - radixPass(scratch, destination, keys, vertex_count, hist, 0); - radixPass(destination, scratch, keys, vertex_count, hist, 1); - radixPass(scratch, destination, keys, vertex_count, hist, 2); + unsigned int* order[] = {scratch, destination}; + + // 5-pass radix sort computes the resulting order into scratch + for (int k = 0; k < 5; ++k) + { + // copy 10-bit key segments into keyk to reduce cache pressure during radix pass + for (size_t i = 0; i < vertex_count; ++i) + keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023); + + radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count); + } // since our 
remap table is mapping old=>new, we need to reverse it for (size_t i = 0; i < vertex_count; ++i) @@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* destination[r * 3 + 2] = c; } } + +void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size) +{ + using namespace meshopt; + + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + assert(cluster_size > 0); + + meshopt_Allocator allocator; + + unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count); + computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false); + + unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3); + unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys + unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count); + + for (int k = 0; k < 3; ++k) + { + // copy 16-bit key segments into keyk to reduce cache pressure during radix pass + for (size_t i = 0; i < vertex_count; ++i) + keyk[i] = (unsigned short)(keys[i] >> (k * 20)); + + unsigned int hist[256][2]; + computeHistogram(hist, keyk, vertex_count); + + for (size_t i = 0; i < vertex_count; ++i) + order[k * vertex_count + i] = unsigned(i); + + radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0); + radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1); + } + + splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size); +} diff --git a/Source/ThirdParty/meshoptimizer/stripifier.cpp b/Source/ThirdParty/meshoptimizer/stripifier.cpp index d57fb512b..4043195ae 100644 --- a/Source/ThirdParty/meshoptimizer/stripifier.cpp +++ b/Source/ThirdParty/meshoptimizer/stripifier.cpp @@ -10,14 +10,14 @@ namespace meshopt { -static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence) +static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence) { unsigned int index = 0; unsigned int iv = ~0u; for (size_t i = 0; i < buffer_size; ++i) { - unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]]; + unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]]; unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? 
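// v is the smallest valence among the triangle's vertices; lower values make better strip starts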
vb : vc); if (v < iv) @@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t strip_size = 0; // compute vertex valence; this is used to prioritize starting triangle for strips - unsigned int* valence = allocator.allocate<unsigned int>(vertex_count); - memset(valence, 0, vertex_count * sizeof(unsigned int)); + // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic + unsigned char* valence = allocator.allocate<unsigned char>(vertex_count); + memset(valence, 0, vertex_count); for (size_t i = 0; i < index_count; ++i) { @@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, { // if we didn't find anything, we need to find the next new triangle // we use a heuristic to maximize the strip length - unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]); + unsigned int i = findStripFirst(buffer, buffer_size, valence); unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2]; // ordered removal from the buffer diff --git a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp index 94f7a1adc..7085cce32 100644 --- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp +++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp @@ -60,6 +60,15 @@ #define SIMD_LATENCYOPT #endif +// In switch dispatch, marking the default case as unreachable allows the compiler to remove redundant bounds checks +#if defined(__GNUC__) +#define SIMD_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define SIMD_UNREACHABLE() __assume(false) +#else +#define SIMD_UNREACHABLE() assert(!"Unreachable") +#endif + #endif // !MESHOPTIMIZER_NO_SIMD #ifdef SIMD_SSE @@ -90,6 +99,14 @@ #include <wasm_simd128.h> #endif +#ifndef TRACE +#define TRACE 0 +#endif + +#if TRACE +#include <stdio.h> +#endif + #ifdef SIMD_WASM #define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i) #define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) @@ -105,50 +122,76 @@ namespace meshopt { const unsigned char kVertexHeader = 0xa0; -static int gEncodeVertexVersion = 0; +static int gEncodeVertexVersion = 1; +const int kDecodeVertexVersion = 1; const size_t kVertexBlockSizeBytes = 8192; const size_t kVertexBlockMaxSize = 256; const size_t kByteGroupSize = 16; const size_t kByteGroupDecodeLimit = 24; -const size_t kTailMaxSize = 32; +const size_t kTailMinSizeV0 = 32; +const size_t kTailMinSizeV1 = 24; + +static const int kBitsV0[4] = {0, 2, 4, 8}; +static const int kBitsV1[5] = {0, 1, 2, 4, 8}; + +const int kEncodeDefaultLevel = 2; static size_t getVertexBlockSize(size_t vertex_size) { - // make sure the entire block fits into the scratch buffer - size_t result = kVertexBlockSizeBytes / vertex_size; - - // align to byte group size; we encode each byte as a byte group - // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size - result &= ~(kByteGroupSize - 1); + // make sure the entire block fits into the scratch buffer and is aligned to byte group size + // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility + size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1); return (result < kVertexBlockMaxSize) ? 
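// clamp to the 256-vertex maximum block size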
result : kVertexBlockMaxSize; } -inline unsigned char zigzag8(unsigned char v) +inline unsigned int rotate(unsigned int v, int r) { - return ((signed char)(v) >> 7) ^ (v << 1); + return (v << r) | (v >> ((32 - r) & 31)); } -inline unsigned char unzigzag8(unsigned char v) +template <typename T> +inline T zigzag(T v) { - return -(v & 1) ^ (v >> 1); + return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); } +template <typename T> +inline T unzigzag(T v) +{ + return (0 - (v & 1)) ^ (v >> 1); +} + +#if TRACE +struct Stats +{ + size_t size; + size_t header; // bytes for header + size_t bitg[9]; // bytes for bit groups + size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group + size_t ctrl[4]; // number of control groups +}; + +static Stats* bytestats = NULL; +static Stats vertexstats[256]; +#endif + static bool encodeBytesGroupZero(const unsigned char* buffer) { - for (size_t i = 0; i < kByteGroupSize; ++i) - if (buffer[i]) - return false; + assert(kByteGroupSize == sizeof(unsigned long long) * 2); - return true; + unsigned long long v[2]; + memcpy(v, buffer, sizeof(v)); + + return (v[0] | v[1]) == 0; } static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) { - assert(bits >= 1 && bits <= 8); + assert(bits >= 0 && bits <= 8); - if (bits == 1) + if (bits == 0) return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); if (bits == 8) @@ -166,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits) { - assert(bits >= 1 && bits <= 8); + assert(bits >= 0 && bits <= 8); + assert(kByteGroupSize % 8 == 0); - if (bits == 1) + if (bits == 0) return data; if (bits == 8) @@ -196,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* byte |= enc; } + // encode 1-bit groups in reverse bit order + // this makes them faster to decode alongside other groups + if (bits == 1) + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + *data++ = byte; } for (size_t i = 0; i < kByteGroupSize; ++i) { - if (buffer[i] >= sentinel) - { - *data++ = buffer[i]; - } + unsigned char v = buffer[i]; + + // branchless append of out-of-range values + *data = v; + data += v >= sentinel; } return data; } -static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size) +static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4]) { assert(buffer_size % kByteGroupSize == 0); @@ -226,69 +276,301 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, memset(header, 0, header_size); + int last_bits = -1; + for (size_t i = 0; i < buffer_size; i += kByteGroupSize) { if (size_t(data_end - data) < kByteGroupDecodeLimit) return NULL; - int best_bits = 8; - size_t best_size = encodeBytesGroupMeasure(buffer + i, 8); + int best_bitk = 3; + size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]); - for (int bits = 1; bits < 8; bits *= 2) + for (int bitk = 0; bitk < 3; ++bitk) { - size_t size = encodeBytesGroupMeasure(buffer + i, bits); + size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]); - if (size < best_size) + // favor consistent bit selection across groups, but never replace literals + if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8)) { - best_bits = bits; + 
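// best_bitk is a 2-bit index into bits[] that is packed into the group header below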
best_bitk = bitk; best_size = size; } } - int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3)); - assert((1 << bitslog2) == best_bits); - size_t header_offset = i / kByteGroupSize; + header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2); - header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); - + int best_bits = bits[best_bitk]; unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); assert(data + best_size == next); data = next; + last_bits = best_bits; + +#if TRACE + bytestats->bitg[best_bits] += best_size; +#endif } +#if TRACE + bytestats->header += header_size; +#endif + return data; } -static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template <typename T, bool Xor> +static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot) +{ + size_t k0 = k & ~(sizeof(T) - 1); + int ks = (k & (sizeof(T) - 1)) * 8; + + T p = last_vertex[k0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= T(last_vertex[k0 + j]) << (j * 8); + + const unsigned char* vertex = vertex_data + k0; + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= vertex[j] << (j * 8); + + T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); + + buffer[i] = (unsigned char)(d >> ks); + p = v; + vertex += vertex_size; + } +} + +static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel) +{ + switch (channel & 3) + { + case 0: + return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 1: + return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 2: + return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); + default: + assert(!"Unsupported channel encoding"); // unreachable + } +} + +static int estimateBits(unsigned char v) +{ + return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; +} + +static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size) +{ + size_t sizes[8] = {}; + + const unsigned char* vertex = vertex_data + k; + unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + + for (size_t i = 0; i < vertex_count; i += group_size) + { + unsigned int bitg = 0; + + // calculate bit consistency mask for the group + for (size_t j = 0; j < group_size && i + j < vertex_count; ++j) + { + unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + unsigned int d = v ^ last; + + bitg |= d; + last = v; + vertex += vertex_size; + } + +#if TRACE + for (int j = 0; j < 32; ++j) + vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1)); +#endif + + for (int j = 0; j < 8; ++j) + { + unsigned int bitr = rotate(bitg, j); + + sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); + sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24)); + } + } + + int best_rot = 0; + for (int rot = 1; rot < 8; ++rot) + best_rot = (sizes[rot] < sizes[best_rot]) ? 
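+ // keep the rotation with the smallest estimated encoded size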
rot : best_rot; + + return best_rot; +} + +static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot) +{ + unsigned char block[kVertexBlockMaxSize]; + assert(vertex_block_size <= kVertexBlockMaxSize); + + unsigned char last_vertex[256] = {}; + + size_t sizes[3] = {}; + assert(max_channel <= 3); + + for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) + { + size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; + size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + if (block_size < block_size_aligned) + memset(block + block_size, 0, block_size_aligned - block_size); + + for (int channel = 0; channel < max_channel; ++channel) + for (size_t j = 0; j < 4; ++j) + { + encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); + + for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) + { + // to maximize encoding performance we only evaluate 1/2/4/8 bit groups + size_t size1 = encodeBytesGroupMeasure(block + ig, 1); + size_t size2 = encodeBytesGroupMeasure(block + ig, 2); + size_t size4 = encodeBytesGroupMeasure(block + ig, 4); + size_t size8 = encodeBytesGroupMeasure(block + ig, 8); + + size_t best_size = size1 < size2 ? size1 : size2; + best_size = best_size < size4 ? best_size : size4; + best_size = best_size < size8 ? best_size : size8; + + sizes[channel] += best_size; + } + } + } + + int best_channel = 0; + for (int channel = 1; channel < max_channel; ++channel) + best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; + + return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel; +} + +static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned) +{ + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + if (!encodeBytesGroupZero(buffer + i)) + return false; + + return true; +} + +static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level) +{ + if (estimateControlZero(buffer, vertex_count_aligned)) + return 2; // zero encoding + + if (level == 0) + return 1; // 1248 encoding in level 0 for encoding speed + + // round number of groups to 4 to get number of header bytes + size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; + + size_t est_bytes0 = header_size, est_bytes1 = header_size; + + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + { + // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance + size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); + size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); + size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); + size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); + size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); + + // both control modes have access to 1/2/4 bit encoding + size_t size12 = size1 < size2 ? size1 : size2; + size_t size124 = size12 < size4 ? size12 : size4; + + // each control mode has access to 0/8 bit encoding respectively + est_bytes0 += size124 < size0 ? size124 : size0; + est_bytes1 += size124 < size8 ? 
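+ // min(1/2/4-bit cost, 8-bit literal cost) accumulates the total for control mode 1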
size124 : size8; + } + + // pick shortest control entry but prefer literal encoding + if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) + return est_bytes0 < est_bytes1 ? 0 : 1; + else + return 3; // literal encoding +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + assert(vertex_size % 4 == 0); unsigned char buffer[kVertexBlockMaxSize]; assert(sizeof(buffer) % kByteGroupSize == 0); + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize memset(buffer, 0, sizeof(buffer)); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + unsigned char* control = data; + data += control_size; + + memset(control, 0, control_size); + for (size_t k = 0; k < vertex_size; ++k) { - size_t vertex_offset = k; + encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); - unsigned char p = last_vertex[k]; +#if TRACE + const unsigned char* olddata = data; + bytestats = &vertexstats[k]; +#endif - for (size_t i = 0; i < vertex_count; ++i) + int ctrl = 0; + + if (version != 0) { - buffer[i] = zigzag8(vertex_data[vertex_offset] - p); + ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); - p = vertex_data[vertex_offset]; + assert(unsigned(ctrl) < 4); + control[k / 4] |= ctrl << ((k % 4) * 2); - vertex_offset += vertex_size; +#if TRACE + vertexstats[k].ctrl[ctrl]++; +#endif } - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); - if (!data) - return NULL; + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(data, buffer, vertex_count); + data += vertex_count; + } + else if (ctrl != 2) // non-zero encoding + { + data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + +#if TRACE + bytestats = NULL; + vertexstats[k].size += data - olddata; +#endif } memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); @@ -297,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) -static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits) { #define READ() byte = *data++ #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? 
encv : enc, data_var += (enc == (1 << bits) - 1) @@ -305,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned unsigned char byte, enc, encv; const unsigned char* data_var; - switch (bitslog2) + switch (bits) { case 0: memset(buffer, 0, kByteGroupSize); return data; case 1: + data_var = data + 2; + + // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups) + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + + return data_var; + case 2: data_var = data + 4; // 4 groups with 4 2-bit values in each byte @@ -320,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); return data_var; - case 2: + case 4: data_var = data + 8; // 8 groups with 2 4-bit values in each byte @@ -334,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(4), NEXT(4); return data_var; - case 3: + case 8: memcpy(buffer, data, kByteGroupSize); return data + kByteGroupSize; default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + assert(!"Unexpected bit length"); // unreachable return data; } @@ -346,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned #undef NEXT } -static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits) { assert(buffer_size % kByteGroupSize == 0); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) @@ -366,43 +658,109 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne return NULL; size_t header_offset = i / kByteGroupSize; + int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroup(data, buffer + i, bitslog2); + data = decodeBytesGroup(data, buffer + i, bits[bitsk]); } return data; } -static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot) { - assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); - - unsigned char buffer[kVertexBlockMaxSize]; - unsigned char transposed[kVertexBlockSizeBytes]; - - size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); - - for (size_t k = 0; k < vertex_size; ++k) + for (size_t k = 0; k < 4; k += sizeof(T)) { - data = decodeBytes(data, data_end, buffer, 
vertex_count_aligned); - if (!data) - return NULL; - size_t vertex_offset = k; - unsigned char p = last_vertex[k]; + T p = last_vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= last_vertex[j] << (8 * j); for (size_t i = 0; i < vertex_count; ++i) { - unsigned char v = unzigzag8(buffer[i]) + p; + T v = buffer[i]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= buffer[i + vertex_count * j] << (8 * j); + + v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p; + + for (size_t j = 0; j < sizeof(T); ++j) + transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8)); - transposed[vertex_offset] = v; p = v; vertex_offset += vertex_size; } + + buffer += vertex_count * sizeof(T); + last_vertex += sizeof(T); + } +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) +{ + assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + + unsigned char buffer[kVertexBlockMaxSize * 4]; + unsigned char transposed[kVertexBlockSizeBytes]; + + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + assert(vertex_count <= vertex_count_aligned); + + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + + for (size_t k = 0; k < vertex_size; k += 4) + { + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; + + for (size_t j = 0; j < 4; ++j) + { + int ctrl = (ctrl_byte >> (j * 2)) & 3; + + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(buffer + j * vertex_count, data, vertex_count); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count, 0, vertex_count); + } + else + { + data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + } + + int channel = version == 0 ? 
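+ // v0 streams predate channels and always use byte deltas; v1 reads one channel byte per 4-byte group from the stream tail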
0 : channels[k / 4]; + + switch (channel & 3) + { + case 0: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type + } } memcpy(vertex_data, transposed, vertex_count * vertex_size); @@ -447,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); #ifdef SIMD_SSE SIMD_TARGET -static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) { __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); @@ -459,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -473,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef __GNUC__ typedef int __attribute__((aligned(1))) unaligned_int; @@ -505,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -518,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -541,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -554,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -562,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + __m128i shuf = decodeShuffleMask(mask0, mask1); + __m128i result = _mm_shuffle_epi8(rest, shuf); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_AVX -static const __m128i decodeBytesGroupConfig[] = { - _mm_set1_epi8(3), - _mm_set1_epi8(15), - _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), - _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 
24, 36, 32, 44, 40, 52, 48, 60, 56), +static const __m128i kDecodeBytesGroupConfig[8][2] = { + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, }; -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -590,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data; } - case 1: - case 2: + case 5: // 1-bit + case 1: // 2-bit + case 6: + case 2: // 4-bit + case 7: { - const unsigned char* skip = data + (bitslog2 << 2); + const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5)); __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); - __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; - __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + __m128i sent = kDecodeBytesGroupConfig[hbits][0]; + __m128i ctrl = kDecodeBytesGroupConfig[hbits][1]; __m128i selw = _mm_shuffle_epi32(selb, 0x44); __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); @@ -613,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -622,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_NEON -static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +SIMD_TARGET +inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) { uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); @@ -640,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8 return vcombine_u8(r0, r1); } -static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +SIMD_TARGET +inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -651,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); } -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + 
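// header values 0 (v0) and 4 (v1) both decode a zero-filled group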
case 4: { uint8x16_t result = vdupq_n_u8(0); @@ -665,6 +1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef SIMD_LATENCYOPT unsigned int data32; @@ -702,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -736,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { uint8x16_t result = vld1q_u8(data); @@ -744,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + uint8x8_t rest0 = vld1_u8(data + 2); + uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1); + + vst1q_u8(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_WASM SIMD_TARGET -static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - + v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); return wasmx_unpacklo_v64x2(sm0, sm1r); } SIMD_TARGET -static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -777,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1 } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { v128_t result = wasm_i8x16_splat(0); @@ -791,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); @@ -805,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -814,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); @@ -827,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); 
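// lanes flagged by mask take out-of-band bytes gathered by the shuffle; the rest keep their unpacked values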
wasm_v128_store(buffer, result); @@ -836,6 +1239,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { v128_t result = wasm_v128_load(data); @@ -844,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + v128_t rest = wasm_v128_load(data + 2); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + v128_t shuf = decodeShuffleMask(mask0, mask1); + v128_t result = wasm_i8x16_swizzle(rest, shuf); + + wasm_v128_store(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) SIMD_TARGET -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) { __m128i t0 = _mm_unpacklo_epi8(x0, x1); __m128i t1 = _mm_unpackhi_epi8(x0, x1); @@ -867,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) } SIMD_TARGET -static __m128i unzigzag8(__m128i v) +inline __m128i unzigzag8(__m128i v) { __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); return _mm_xor_si128(xl, xr); } + +SIMD_TARGET +inline __m128i unzigzag16(__m128i v) +{ + __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); + __m128i xr = _mm_srli_epi16(v, 1); + + return _mm_xor_si128(xl, xr); +} + +SIMD_TARGET +inline __m128i rotate32(__m128i v, int r) +{ + return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); +} #endif #ifdef SIMD_NEON -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +SIMD_TARGET +inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) { uint8x16x2_t t01 = vzipq_u8(x0, x1); uint8x16x2_t t23 = vzipq_u8(x2, x3); @@ -891,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_ x3 = vreinterpretq_u8_u16(x23.val[1]); } -static uint8x16_t unzigzag8(uint8x16_t v) +SIMD_TARGET +inline uint8x16_t unzigzag8(uint8x16_t v) { uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); uint8x16_t xr = vshrq_n_u8(v, 1); return veorq_u8(xl, xr); } + +SIMD_TARGET +inline uint8x16_t unzigzag16(uint8x16_t v) +{ + uint16x8_t vv = vreinterpretq_u16_u8(v); + uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1))))); + uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1)); + + return veorq_u8(xl, xr); +} + +SIMD_TARGET +inline uint8x16_t rotate32(uint8x16_t v, int r) +{ + uint32x4_t v32 = vreinterpretq_u32_u8(v); + return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); +} + +template +SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3) +{ + switch (Channel) + { + case 0: + { + uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3)); + uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum)); + return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); + } + case 1: + { + uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), 
vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3))); + uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum)); + return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2))); + } + case 2: + { + uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3)); + uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum)); + return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); + } + default: + return npi; + } +} #endif #ifdef SIMD_WASM SIMD_TARGET -static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) +inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) { v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); @@ -916,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) } SIMD_TARGET -static v128_t unzigzag8(v128_t v) +inline v128_t unzigzag8(v128_t v) { v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); v128_t xr = wasm_u8x16_shr(v, 1); return wasm_v128_xor(xl, xr); } + +SIMD_TARGET +inline v128_t unzigzag16(v128_t v) +{ + v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1))); + v128_t xr = wasm_u16x8_shr(v, 1); + + return wasm_v128_xor(xl, xr); +} + +SIMD_TARGET +inline v128_t rotate32(v128_t v, int r) +{ + return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r)); +} #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) SIMD_TARGET -static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift) { assert(buffer_size % kByteGroupSize == 0); assert(kByteGroupSize == 16); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; size_t i = 0; - // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b + // fast-path: process 4 groups at a time, do a shared bounds check for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) { size_t header_offset = i / kByteGroupSize; unsigned char header_byte = header[header_offset / 4]; - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3); - data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3)); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3)); } // slow-path: process remaining groups @@ -963,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns return NULL; size_t header_offset = 
i / kByteGroupSize; + unsigned char header_byte = header[header_offset / 4]; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroupSimd(data, buffer + i, bitslog2); + data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3)); } return data; } +template +SIMD_TARGET static void +decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot) +{ +#if defined(SIMD_SSE) || defined(SIMD_AVX) +#define TEMP __m128i +#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) +#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) +#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) +#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size +#endif + +#ifdef SIMD_NEON +#define TEMP uint8x8_t +#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) +#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) +#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) +#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size +#endif + +#ifdef SIMD_WASM +#define TEMP v128_t +#define PREP() v128_t pi = wasm_v128_load(last_vertex) +#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) +#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) +#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size +#endif + +#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? 
unzigzag16(r##i) : rotate32(r##i, rot)) + + PREP(); + + unsigned char* savep = transposed; + + for (size_t j = 0; j < vertex_count_aligned; j += 16) + { + LOAD(0); + LOAD(1); + LOAD(2); + LOAD(3); + + transpose8(r0, r1, r2, r3); + + TEMP t0, t1, t2, t3; + TEMP npi = pi; + + UNZR(0); + GRP4(0); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(1); + GRP4(1); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(2); + GRP4(2); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + UNZR(3); + GRP4(3); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + +#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) + // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations + pi = rebase(npi, r0, r1, r2, r3); +#else + (void)npi; +#endif + +#undef UNZR +#undef TEMP +#undef PREP +#undef LOAD +#undef GRP4 +#undef FIXD +#undef SAVE + } +} + SIMD_TARGET -static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); @@ -982,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + for (size_t k = 0; k < vertex_size; k += 4) { + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; + for (size_t j = 0; j < 4; ++j) { - data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned); - if (!data) - return NULL; + int ctrl = (ctrl_byte >> (j * 2)) & 3; + + if (ctrl == 3) + { + // literal encoding; safe to over-copy due to tail + if (size_t(data_end - data) < vertex_count_aligned) + return NULL; + + memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned); + } + else + { + // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8 + int hshift = version == 0 ? 0 : 4 + ctrl; + + data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift); + if (!data) + return NULL; + } } -#if defined(SIMD_SSE) || defined(SIMD_AVX) -#define TEMP __m128i -#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex + k)) -#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) -#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) -#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size -#endif + int channel = version == 0 ? 
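+ // same channel dispatch as the scalar decoder: v0 always uses byte deltas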
0 : channels[k / 4]; -#ifdef SIMD_NEON -#define TEMP uint8x8_t -#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex + k), vdup_n_u32(0), 0)) -#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) -#define FIXD(i) t##i = pi = vadd_u8(pi, t##i) -#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size -#endif - -#ifdef SIMD_WASM -#define TEMP v128_t -#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) -#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) -#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) -#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) -#define SAVE(i) *reinterpret_cast(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size -#endif - - PREP(); - - unsigned char* savep = transposed + k; - - for (size_t j = 0; j < vertex_count_aligned; j += 16) + switch (channel & 3) { - LOAD(0); - LOAD(1); - LOAD(2); - LOAD(3); - - r0 = unzigzag8(r0); - r1 = unzigzag8(r1); - r2 = unzigzag8(r2); - r3 = unzigzag8(r3); - - transpose8(r0, r1, r2, r3); - - TEMP t0, t1, t2, t3; - - GRP4(0); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(1); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(2); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - - GRP4(3); - FIXD(0), FIXD(1), FIXD(2), FIXD(3); - SAVE(0), SAVE(1), SAVE(2), SAVE(3); - -#undef TEMP -#undef PREP -#undef LOAD -#undef GRP4 -#undef FIXD -#undef SAVE + case 0: + decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type } } @@ -1088,23 +1643,29 @@ static unsigned int cpuid = getCpuFeatures(); } // namespace meshopt -size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size) +size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version) { using namespace meshopt; assert(vertex_size > 0 && vertex_size <= 256); assert(vertex_size % 4 == 0); + assert(level >= 0 && level <= 9); // only a subset of this range is used right now + assert(version < 0 || unsigned(version) <= kDecodeVertexVersion); + + version = version < 0 ? 
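+ // a negative version selects the default configured via meshopt_encodeVertexVersion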
gEncodeVertexVersion : version; + +#if TRACE + memset(vertexstats, 0, sizeof(vertexstats)); +#endif const unsigned char* vertex_data = static_cast(vertices); unsigned char* data = buffer; unsigned char* data_end = buffer + buffer_size; - if (size_t(data_end - data) < 1 + vertex_size) + if (size_t(data_end - data) < 1) return 0; - int version = gEncodeVertexVersion; - *data++ = (unsigned char)(kVertexHeader | version); unsigned char first_vertex[256] = {}; @@ -1116,40 +1677,110 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con size_t vertex_block_size = getVertexBlockSize(vertex_size); + unsigned char channels[64] = {}; + if (version != 0 && level > 1 && vertex_count > 1) + for (size_t k = 0; k < vertex_size; k += 4) + { + int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0; + int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot); + + assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8)); + channels[k / 4] = (unsigned char)channel; + } + size_t vertex_offset = 0; while (vertex_offset < vertex_count) { size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; - data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex); + data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level); if (!data) return 0; vertex_offset += block_size; } - size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; + size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); + size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; + size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; - if (size_t(data_end - data) < tail_size) + if (size_t(data_end - data) < tail_size_pad) return 0; - // write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder - if (vertex_size < kTailMaxSize) + if (tail_size < tail_size_pad) { - memset(data, 0, kTailMaxSize - vertex_size); - data += kTailMaxSize - vertex_size; + memset(data, 0, tail_size_pad - tail_size); + data += tail_size_pad - tail_size; } memcpy(data, first_vertex, vertex_size); data += vertex_size; + if (version != 0) + { + memcpy(data, channels, vertex_size / 4); + data += vertex_size / 4; + } + assert(data >= buffer + tail_size); assert(data <= buffer + buffer_size); +#if TRACE + size_t total_size = data - buffer; + + for (size_t k = 0; k < vertex_size; ++k) + { + const Stats& vsk = vertexstats[k]; + + printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8); + + size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8]; + double total_kr = total_k ? 1.0 / double(total_k) : 0; + + if (version != 0) + { + int channel = channels[k / 4]; + + if ((channel & 3) == 2 && k % 4 == 0) + printf(" | ^%d", channel >> 4); + else + printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? 
"2" : ".")); + } + + printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]", + double(vsk.header) * total_kr * 100, + double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100, + double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100); + + size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3]; + + if (total_ctrl) + { + printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%", + double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100, + double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100); + } + + if (level >= 3) + printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]", + double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100, + double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100, + double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100, + double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100); + + printf("\n"); + } +#endif + return data - buffer; } +size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size) +{ + return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion); +} + size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size) { using namespace meshopt; @@ -1160,21 +1791,42 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size) size_t vertex_block_size = getVertexBlockSize(vertex_size); size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; + size_t vertex_block_control_size = vertex_size / 4; size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; size_t vertex_block_data_size = vertex_block_size; - size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; + size_t tail_size = vertex_size + (vertex_size / 4); + size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1; + size_t tail_size_pad = tail_size < tail_size_min ? 
tail_size_min : tail_size; + assert(tail_size_pad >= kByteGroupDecodeLimit); - return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size; + return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad; } void meshopt_encodeVertexVersion(int version) { - assert(unsigned(version) <= 0); + assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion)); meshopt::gEncodeVertexVersion = version; } +int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size) +{ + if (buffer_size < 1) + return -1; + + unsigned char header = buffer[0]; + + if ((header & 0xf0) != meshopt::kVertexHeader) + return -1; + + int version = header & 0x0f; + if (version > meshopt::kDecodeVertexVersion) + return -1; + + return version; +} + int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size) { using namespace meshopt; @@ -1182,7 +1834,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve assert(vertex_size > 0 && vertex_size <= 256); assert(vertex_size % 4 == 0); - const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL; + const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL; #if defined(SIMD_SSE) && defined(SIMD_FALLBACK) decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; @@ -1202,7 +1854,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve const unsigned char* data = buffer; const unsigned char* data_end = buffer + buffer_size; - if (size_t(data_end - data) < 1 + vertex_size) + if (size_t(data_end - data) < 1) return -2; unsigned char data_header = *data++; @@ -1211,11 +1863,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve return -1; int version = data_header & 0x0f; - if (version > 0) + if (version > kDecodeVertexVersion) return -1; + size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); + size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; + size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; + + if (size_t(data_end - data) < tail_size_pad) + return -2; + + const unsigned char* tail = data_end - tail_size; + unsigned char last_vertex[256]; - memcpy(last_vertex, data_end - vertex_size, vertex_size); + memcpy(last_vertex, tail, vertex_size); + + const unsigned char* channels = version == 0 ? NULL : tail + vertex_size; size_t vertex_block_size = getVertexBlockSize(vertex_size); @@ -1225,16 +1888,14 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve { size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; - data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex); + data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version); if (!data) return -2; vertex_offset += block_size; } - size_t tail_size = vertex_size < kTailMaxSize ? 
kTailMaxSize : vertex_size;
-
-    if (size_t(data_end - data) != tail_size)
+    if (size_t(data_end - data) != tail_size_pad)
         return -3;

     return 0;
@@ -1246,3 +1907,4 @@
 #undef SIMD_WASM
 #undef SIMD_FALLBACK
 #undef SIMD_TARGET
+#undef SIMD_LATENCYOPT
diff --git a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
index 4b5f444f0..3fd836083 100644
--- a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)

 static void decodeFilterQuat(short* data, size_t count)
 {
-    const float scale = 1.f / sqrtf(2.f);
+    const float scale = 32767.f / sqrtf(2.f);

     for (size_t i = 0; i < count; ++i)
     {
         // recover scale from the high byte of the component
         int sf = data[i * 4 + 3] | 3;
-        float ss = scale / float(sf);
+        float s = float(sf);

-        // convert x/y/z to [-1..1] (scaled...)
-        float x = float(data[i * 4 + 0]) * ss;
-        float y = float(data[i * 4 + 1]) * ss;
-        float z = float(data[i * 4 + 2]) * ss;
+        // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+        float x = float(data[i * 4 + 0]);
+        float y = float(data[i * 4 + 1]);
+        float z = float(data[i * 4 + 2]);

-        // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-        float ww = 1.f - x * x - y * y - z * z;
+        // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+        float ws = s * s;
+        float ww = ws * 2.f - x * x - y * y - z * z;
         float w = sqrtf(ww >= 0.f ? ww : 0.f);

+        // compute final scale; note that all computations above are unscaled
+        // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+        float ss = scale / s;
+
         // rounded signed float->int
-        int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
-        int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
-        int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
-        int wf = int(w * 32767.f + 0.5f);
+        int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
+        int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
+        int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
+        int wf = int(w * ss + 0.5f);

         int qc = data[i * 4 + 3] & 3;
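The quaternion rework above is a pure rescaling refactor: components stay in integer-valued fixed point (implied scale 1/(sf*sqrt(2))), w is reconstructed as sqrt(2*sf^2 - x^2 - y^2 - z^2) in that space, and a single multiply by 32767/(sqrt(2)*sf) maps everything to int16 range at the end. A self-contained numeric check of the equivalence (hedged sketch; the helper name is illustrative):

#include <assert.h>
#include <math.h>

static void checkQuatRescale(int xi, int yi, int zi, int sf)
{
    const float scale = 32767.f / sqrtf(2.f);
    // old form: normalize first, then scale by 32767
    float ssOld = (1.f / sqrtf(2.f)) / float(sf);
    float wOld = sqrtf(fmaxf(1.f - ssOld * ssOld * float(xi * xi + yi * yi + zi * zi), 0.f)) * 32767.f;
    // new form: stay in fixed point, apply the combined scale once
    float ww = float(sf) * float(sf) * 2.f - float(xi * xi + yi * yi + zi * zi);
    float wNew = sqrtf(fmaxf(ww, 0.f)) * (scale / float(sf));
    assert(fabsf(wOld - wNew) < 1.f); // equal up to rounding
}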
@@ -165,6 +170,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
         data[i] = u.ui;
     }
 }
+
+template <typename T, typename ST>
+static void decodeFilterColor(T* data, size_t count)
+{
+    const float max = float((1 << (sizeof(T) * 8)) - 1);
+
+    for (size_t i = 0; i < count; ++i)
+    {
+        // recover scale from alpha high bit
+        int as = data[i * 4 + 3];
+        as |= as >> 1;
+        as |= as >> 2;
+        as |= as >> 4;
+        as |= as >> 8; // noop for 8-bit
+
+        // convert to RGB in fixed point (co/cg are sign extended)
+        int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
+
+        int r = y + co - cg;
+        int g = y + cg;
+        int b = y - co - cg;
+
+        // expand alpha by one bit to match other components
+        int a = data[i * 4 + 3];
+        a = ((a << 1) & as) | (a & 1);
+
+        // compute scaling factor
+        float ss = max / float(as);
+
+        // rounded float->int
+        int rf = int(float(r) * ss + 0.5f);
+        int gf = int(float(g) * ss + 0.5f);
+        int bf = int(float(b) * ss + 0.5f);
+        int af = int(float(a) * ss + 0.5f);
+
+        data[i * 4 + 0] = T(rf);
+        data[i * 4 + 1] = T(gf);
+        data[i * 4 + 2] = T(bf);
+        data[i * 4 + 3] = T(af);
+    }
+}
 #endif

 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -201,7 +247,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
 #endif

 #ifdef SIMD_SSE
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
     const __m128 sign = _mm_set1_ps(-0.f);

@@ -246,7 +292,7 @@
     }
 }

-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
     const __m128 sign = _mm_set1_ps(-0.f);

@@ -295,8 +341,9 @@
         __m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);

         // patch in .w
-        res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
-        res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+        __m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0);
+        res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw));
+        res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw));

         _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
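The decodeFilterColor template above inverts the integer YCoCg-R transform that meshopt_encodeFilterColor (added later in this patch) applies. A minimal roundtrip sketch of the transform pair, with both directions inlined:

// forward as in meshopt_encodeFilterColor, inverse as in decodeFilterColor;
// b is recovered exactly, r/g are exact whenever the halved differences do
// not truncate (the encoder's range assert covers the remaining cases)
static void ycocgrRoundtrip(int r, int g, int b)
{
    int co = (r - b) / 2;
    int tmp = b + co;
    int cg = (g - tmp) / 2;
    int y = tmp + cg;

    int rd = y + co - cg; // == r when (r - b) is even
    int gd = y + cg;      // == g when (g - tmp) is even
    int bd = y - co - cg; // == b always
    (void)rd, (void)gd, (void)bd;
}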
@@ -305,7 +352,7 @@

 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-    const float scale = 1.f / sqrtf(2.f);
+    const float scale = 32767.f / sqrtf(2.f);

     for (size_t i = 0; i < count; i += 4)
     {
@@ -324,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
         // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
         __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
-        __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+        __m128 s = _mm_cvtepi32_ps(sf);

-        // convert x/y/z to [-1..1] (scaled...)
-        __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
-        __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
-        __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+        // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+        __m128 x = _mm_cvtepi32_ps(xf);
+        __m128 y = _mm_cvtepi32_ps(yf);
+        __m128 z = _mm_cvtepi32_ps(zf);

-        // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-        __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+        // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+        __m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
+        __m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
         __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));

-        __m128 s = _mm_set1_ps(32767.f);
+        // compute final scale; note that all computations above are unscaled
+        // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+        __m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);

         // rounded signed float->int
-        __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
-        __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
-        __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
-        __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+        __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
+        __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
+        __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
+        __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));

         // mix x/z and w/y to make 16-bit unpack easier
         __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -385,6 +435,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
         _mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
     }
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+    for (size_t i = 0; i < count; i += 4)
+    {
+        __m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+        // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+        __m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
+        __m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
+        __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
+        __m128i af = _mm_srli_epi32(c4, 24);
+
+        // recover scale from alpha high bit
+        __m128i as = af;
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+
+        // expand alpha by one bit to match other components
+        af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+        // compute scaling factor
+        __m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
+
+        // convert to RGB in fixed point
+        __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+        __m128i gf = _mm_add_epi32(yf, cgf);
+        __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+        // rounded signed float->int
+        __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+        __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+        __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+        __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+        // repack rgba into final value
+        __m128i res = rr;
+        res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
+        res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
+        res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
+
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+    }
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+    for (size_t i = 0; i < count; i += 4)
+    {
+        __m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
+        __m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
+
+        // gather both y/co 16-bit pairs in each 32-bit lane
+        __m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
+        __m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
+
+        // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+        __m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
+        __m128i cof = _mm_srai_epi32(c4_yco, 16);
+        __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
+        __m128i af = _mm_srli_epi32(c4_cga, 16);
+
+        // recover scale from alpha high bit
+        __m128i as = af;
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+        as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
+
+        // expand alpha by one bit to match other components
+        af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+        // compute scaling factor
+        __m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
+
+        // convert to RGB in fixed point
+        __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+        __m128i gf = _mm_add_epi32(yf, cgf);
+        __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+        // rounded signed float->int
+        __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+        __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+        __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+        __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+        // mix r/b and g/a to make 16-bit unpack easier
+        __m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
+        __m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
+
+        // pack r/g/b/a using 16-bit unpacks
+        __m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
+        __m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
+
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+    }
+}
 #endif

 #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
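Both SSE color paths rely on the same scale-recovery idea as the scalar template: the encoder always sets alpha's top bit (see the `| (1 << (bits - 1))` in meshopt_encodeFilterColor below), so smearing that bit right produces the full K-bit mask — the fixed-point "one" the components were quantized against. A scalar model of the trick:

static inline int smearHighBit(int as)
{
    as |= as >> 1;
    as |= as >> 2;
    as |= as >> 4;
    as |= as >> 8; // noop for 8-bit inputs
    return as;     // e.g. 0x80 -> 0xff, 0x2000 -> 0x3fff
}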
@@ -401,10 +550,17 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
     r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
     return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+    return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
 #endif

 #ifdef SIMD_NEON
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
     const int32x4_t sign = vdupq_n_s32(0x80000000);
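The NEON decoders below use the same `3 << 22` "snap" constant as the SSE paths for fast rounding. A scalar model of the trick, assuming IEEE-754 single precision (helper name illustrative): adding 1.5 * 2^23 (bit pattern 0x4B400000) forces the sum into an exponent range where floats are integer-spaced, so the add itself rounds, and the rounded integer lands in the low mantissa bits.

static inline int fastRound16(float v)
{
    union { float f; unsigned int ui; } u;
    u.f = v + 12582912.f;        // 3 << 22; bit pattern becomes 0x4B400000 + round(v)
    return (int)(u.ui & 0xffff); // low 16 bits hold the rounded value (mod 2^16)
}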
@@ -431,29 +587,27 @@
         y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

         // compute normal length & scale
-        float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+        float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
         float32x4_t rl = vrsqrteq_f32(ll);
         float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);

         // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-        // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+        // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
         const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-        int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-        int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-        int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+        int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+        int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+        int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

         // combine xr/yr/zr into final value
-        int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-        res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-        res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-        res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+        int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+        res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);

         vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
     }
 }

-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
     const int32x4_t sign = vdupq_n_s32(0x80000000);

@@ -485,21 +639,25 @@
         y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

         // compute normal length & scale
-        float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+        float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
         float32x4_t rl = vrsqrteq_f32(ll);
         rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
         float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+        float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif

         // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
         // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
         const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-        int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-        int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-        int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+        int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+        int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+        int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

         // mix x/z and y/0 to make 16-bit unpack easier
-        int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+        int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
         int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));

         // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -517,7 +675,7 @@

 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-    const float scale = 1.f / sqrtf(2.f);
+    const float scale = 32767.f / sqrtf(2.f);

     for (size_t i = 0; i < count; i += 4)
     {
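The packing changes above lean on NEON's VSLI (shift left and insert), which collapses the previous and/or/shift triples into one instruction. A scalar model of the per-lane semantics:

// the shifted source overwrites the top bits; the destination keeps the
// bottom `shift` bits, so sli32(xr, zr, 16) == (zr << 16) | (xr & 0xffff)
static inline unsigned int sli32(unsigned int dst, unsigned int src, int shift)
{
    return (src << shift) | (dst & ((1u << shift) - 1));
}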
@@ -536,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)
         // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
         int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
-        float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+        float32x4_t s = vcvtq_f32_s32(sf);

-        // convert x/y/z to [-1..1] (scaled...)
-        float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
-        float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
-        float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+        // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+        float32x4_t x = vcvtq_f32_s32(xf);
+        float32x4_t y = vcvtq_f32_s32(yf);
+        float32x4_t z = vcvtq_f32_s32(zf);

-        // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-        float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+        // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+        float32x4_t ws = vmulq_f32(s, s);
+        float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
         float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));

-        float32x4_t s = vdupq_n_f32(32767.f);
+        // compute final scale; note that all computations above are unscaled
+        // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+        float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);

         // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
         // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
         const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-        int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-        int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-        int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
-        int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+        int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+        int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+        int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+        int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));

         // mix x/z and w/y to make 16-bit unpack easier
-        int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-        int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+        int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+        int32x4_t wyr = vsliq_n_s32(wr, yr, 16);

         // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-        int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-        int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+        uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+        uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+        // store results to stack so that we can rotate using scalar instructions
+        // TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+        volatile uint64_t res[4];
+        vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+        vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);

         // rotate and store
-        uint64_t* out = (uint64_t*)&data[i * 4];
+        uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

-        out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-        out[1] = 
rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4); - out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4); - out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4); + out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4); + out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4); + out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4); + out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4); } } @@ -595,10 +762,112 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) vst1q_f32(reinterpret_cast(&data[i]), r); } } + +static void decodeFilterColorSimd8(unsigned char* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + int32x4_t c4 = vld1q_s32(reinterpret_cast(&data[i * 4])); + + // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts) + int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff)); + int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24); + int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24); + int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24)); + + // recover scale from alpha high bit + int32x4_t as = af; + as = vorrq_s32(as, vshrq_n_s32(as, 1)); + as = vorrq_s32(as, vshrq_n_s32(as, 2)); + as = vorrq_s32(as, vshrq_n_s32(as, 4)); + + // expand alpha by one bit to match other components + af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1))); + + // compute scaling factor + float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as))); + + // convert to RGB in fixed point + int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf)); + int32x4_t gf = vaddq_s32(yf, cgf); + int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss)); + int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss)); + int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss)); + int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss)); + + // repack rgba into final value + int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8); + + vst1q_s32(reinterpret_cast(&data[i * 4]), res); + } +} + +static void decodeFilterColorSimd16(unsigned short* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + int32x4_t c4_0 = vld1q_s32(reinterpret_cast(&data[(i + 0) * 4])); + int32x4_t c4_1 = vld1q_s32(reinterpret_cast(&data[(i + 2) * 4])); + + // gather both y/co 16-bit pairs in each 32-bit lane + int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0]; + int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1]; + + // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts) + int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff)); + int32x4_t cof = vshrq_n_s32(c4_yco, 16); + int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16); + int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16)); + + // recover scale from alpha high bit + int32x4_t as = af; + as = vorrq_s32(as, vshrq_n_s32(as, 1)); + as = vorrq_s32(as, vshrq_n_s32(as, 2)); + as = vorrq_s32(as, vshrq_n_s32(as, 4)); + as = vorrq_s32(as, vshrq_n_s32(as, 
8)); + + // expand alpha by one bit to match other components + af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1))); + + // compute scaling factor + float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as)); + + // convert to RGB in fixed point + int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf)); + int32x4_t gf = vaddq_s32(yf, cgf); + int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss)); + int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss)); + int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss)); + int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss)); + + // mix r/b and g/a to make 16-bit unpack easier + int32x4_t rbr = vsliq_n_s32(rr, br, 16); + int32x4_t gar = vsliq_n_s32(gr, ar, 16); + + // pack r/g/b/a using 16-bit unpacks + int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]); + int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]); + + vst1q_s32(reinterpret_cast(&data[(i + 0) * 4]), res_0); + vst1q_s32(reinterpret_cast(&data[(i + 2) * 4]), res_1); + } +} #endif #ifdef SIMD_WASM -static void decodeFilterOctSimd(signed char* data, size_t count) +static void decodeFilterOctSimd8(signed char* data, size_t count) { const v128_t sign = wasm_f32x4_splat(-0.f); @@ -647,10 +916,11 @@ static void decodeFilterOctSimd(signed char* data, size_t count) } } -static void decodeFilterOctSimd(short* data, size_t count) +static void decodeFilterOctSimd16(short* data, size_t count) { const v128_t sign = wasm_f32x4_splat(-0.f); - const v128_t zmask = wasm_i32x4_splat(0x7fff); + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zmask = wasm_i32x4_splat(0x7fff); for (size_t i = 0; i < count; i += 4) { @@ -711,7 +981,7 @@ static void decodeFilterOctSimd(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -730,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3)); - v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf)); + v128_t s = wasm_f32x4_convert_i32x4(sf); - // convert x/y/z to [-1..1] (scaled...) - v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss); - v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss); - v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss); + // convert x/y/z to floating point (unscaled! 
implied scale of 1/sqrt(2.f) * 1/sf) + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_convert_i32x4(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors // note: i32x4_max with 0 is equivalent to f32x4_max - v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); + v128_t ws = wasm_f32x4_mul(s, s); + v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0))); - v128_t s = wasm_f32x4_splat(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const v128_t fsnap = wasm_f32x4_splat(3 << 22); - v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); - v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); - v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); - v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap); + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap); + v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap); // mix x/z and w/y to make 16-bit unpack easier v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16)); @@ -762,8 +1035,7 @@ static void decodeFilterQuatSimd(short* data, size_t count) v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr); // compute component index shifted left by 4 (and moved into i32x4 slot) - // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449 - volatile v128_t cm = wasm_i32x4_shl(cf, 4); + v128_t cm = wasm_i32x4_shl(cf, 4); // rotate and store uint64_t* out = reinterpret_cast(&data[i * 4]); @@ -794,6 +1066,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) wasm_v128_store(&data[i], r); } } + +static void decodeFilterColorSimd8(unsigned char* data, size_t count) +{ + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zero = wasm_i32x4_splat(0); + + for (size_t i = 0; i < count; i += 4) + { + v128_t c4 = wasm_v128_load(&data[i * 4]); + + // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts) + v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff)); + v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24); + v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24); + v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24)); + + // recover scale from alpha high bit + v128_t as = af; + as = wasm_v128_or(as, wasm_i32x4_shr(as, 1)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 2)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 4)); + + // expand alpha by one bit to match other components + af = 
wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1))); + + // compute scaling factor + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as)); + + // convert to RGB in fixed point + v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf)); + v128_t gf = wasm_i32x4_add(yf, cgf); + v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap); + v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap); + v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap); + v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap); + + // repack rgba into final value + v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff)); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8)); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16)); + res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24)); + + wasm_v128_store(&data[i * 4], res); + } +} + +static void decodeFilterColorSimd16(unsigned short* data, size_t count) +{ + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zero = wasm_i32x4_splat(0); + + for (size_t i = 0; i < count; i += 4) + { + v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]); + v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]); + + // gather both y/co 16-bit pairs in each 32-bit lane + v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1); + v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1); + + // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts) + v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff)); + v128_t cof = wasm_i32x4_shr(c4_yco, 16); + v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16); + v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16)); + + // recover scale from alpha high bit + v128_t as = af; + as = wasm_v128_or(as, wasm_i32x4_shr(as, 1)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 2)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 4)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 8)); + + // expand alpha by one bit to match other components + af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1))); + + // compute scaling factor + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as)); + + // convert to RGB in fixed point + v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf)); + v128_t gf = wasm_i32x4_add(yf, cgf); + v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap); + v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap); + v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap); + v128_t ar = 
wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+        // mix r/b and g/a to make 16-bit unpack easier
+        v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
+        v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
+
+        // pack r/g/b/a using 16-bit unpacks
+        v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
+        v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
+
+        wasm_v128_store(&data[(i + 0) * 4], res_0);
+        wasm_v128_store(&data[(i + 2) * 4], res_1);
+    }
+}
 #endif

 // optimized variant of frexp
@@ -807,7 +1190,7 @@ inline int optlog2(float v)
     u.f = v;

     // +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
-    return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+    return v == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
 }

 // optimized variant of ldexp
@@ -833,9 +1216,9 @@ void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)

 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
     if (stride == 4)
-        dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
+        dispatchSimd(decodeFilterOctSimd8, static_cast<signed char*>(buffer), count, 4);
     else
-        dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
+        dispatchSimd(decodeFilterOctSimd16, static_cast<short*>(buffer), count, 4);
 #else
     if (stride == 4)
         decodeFilterOct(static_cast<signed char*>(buffer), count);
@@ -871,10 +1254,29 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 #endif
 }

+void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride)
+{
+    using namespace meshopt;
+
+    assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+    if (stride == 4)
+        dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4);
+    else
+        dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4);
+#else
+    if (stride == 4)
+        decodeFilterColor<unsigned char, signed char>(static_cast<unsigned char*>(buffer), count);
+    else
+        decodeFilterColor<unsigned short, short>(static_cast<unsigned short*>(buffer), count);
+#endif
+}
+
 void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
 {
     assert(stride == 4 || stride == 8);
-    assert(bits >= 1 && bits <= 16);
+    assert(bits >= 2 && bits <= 16);

     signed char* d8 = static_cast<signed char*>(destination);
     short* d16 = static_cast<short*>(destination);

@@ -1010,6 +1412,20 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
             component_exp[j] = (min_exp < e) ? e : min_exp;
         }
     }
+    else if (mode == meshopt_EncodeExpClamped)
+    {
+        for (size_t j = 0; j < stride_float; ++j)
+        {
+            int e = optlog2(v[j]);
+
+            component_exp[j] = (0 < e) ? e : 0;
+        }
+    }
+    else
+    {
+        // the code below assumes component_exp is initialized outside of the loop
+        assert(mode == meshopt_EncodeExpSharedComponent);
+    }

     for (size_t j = 0; j < stride_float; ++j)
     {
@@ -1020,7 +1436,6 @@
         // compute renormalized rounded mantissa for each component
         int mmask = (1 << 24) - 1;
-
         int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));

         d[j] = (m & mmask) | (unsigned(exp) << 24);

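The new meshopt_EncodeExpClamped branch above floors each component's exponent at zero, so small components don't get arbitrarily fine exponents. A usage sketch, assuming the public meshopt_encodeFilterExp(destination, count, stride, bits, data, mode) prototype (stride is in bytes; 12 fits a tightly packed float3):

#include <vector>

std::vector<unsigned int> packed(count * 3);
meshopt_encodeFilterExp(packed.data(), count, 12, /* bits= */ 15, positions, meshopt_EncodeExpClamped);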
@@ -1028,6 +1443,51 @@
 }
 }

+void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+    assert(stride == 4 || stride == 8);
+    assert(bits >= 2 && bits <= 16);
+
+    unsigned char* d8 = static_cast<unsigned char*>(destination);
+    unsigned short* d16 = static_cast<unsigned short*>(destination);
+
+    for (size_t i = 0; i < count; ++i)
+    {
+        const float* c = &data[i * 4];
+
+        int fr = meshopt_quantizeUnorm(c[0], bits);
+        int fg = meshopt_quantizeUnorm(c[1], bits);
+        int fb = meshopt_quantizeUnorm(c[2], bits);
+
+        // YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers
+        int fco = (fr - fb) / 2;
+        int tmp = fb + fco;
+        int fcg = (fg - tmp) / 2;
+        int fy = tmp + fcg;
+
+        // validate that R/G/B can be reconstructed with K bit integers
+        assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
+
+        // alpha: K-1-bit encoding with high bit set to 1
+        int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
+
+        if (stride == 4)
+        {
+            d8[i * 4 + 0] = (unsigned char)(fy);
+            d8[i * 4 + 1] = (unsigned char)(fco);
+            d8[i * 4 + 2] = (unsigned char)(fcg);
+            d8[i * 4 + 3] = (unsigned char)(fa);
+        }
+        else
+        {
+            d16[i * 4 + 0] = (unsigned short)(fy);
+            d16[i * 4 + 1] = (unsigned short)(fco);
+            d16[i * 4 + 2] = (unsigned short)(fcg);
+            d16[i * 4 + 3] = (unsigned short)(fa);
+        }
+    }
+}
+
 #undef SIMD_SSE
 #undef SIMD_NEON
 #undef SIMD_WASM
diff --git a/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp b/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
deleted file mode 100644
index 51dca873f..000000000
--- a/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
-#include "meshoptimizer.h"
-
-#include <assert.h>
-#include <string.h>
-
-meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
-{
-    assert(index_count % 3 == 0);
-    assert(vertex_size > 0 && vertex_size <= 256);
-
-    meshopt_Allocator allocator;
-
-    meshopt_VertexFetchStatistics result = {};
-
-    unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
-    memset(vertex_visited, 0, vertex_count);
-
-    const size_t kCacheLine = 64;
-    const size_t kCacheSize = 128 * 1024;
-
-    // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
-    size_t cache[kCacheSize / kCacheLine] = {};
-
-    for (size_t i = 0; i < index_count; ++i)
-    {
-        unsigned int index = indices[i];
-        assert(index < vertex_count);
-
-        vertex_visited[index] = 1;
-
-        size_t start_address = index * vertex_size;
-        size_t end_address = start_address + vertex_size;
-
-        size_t start_tag = start_address / kCacheLine;
-        size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
-
-        assert(start_tag < end_tag);
-
-        for (size_t tag = start_tag; tag < end_tag; ++tag)
-        {
-            size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
-
-            // we store +1 since cache is filled with 0 by default
-            result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
-            cache[line] = tag + 1;
-        }
-    }
-
-    size_t unique_vertex_count = 0;
-
-    for (size_t i = 0; i < vertex_count; ++i)
-        unique_vertex_count += vertex_visited[i];
-
-    result.overfetch = unique_vertex_count == 0 ? 
0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
-
-    return result;
-}
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
index 60be17f0b..ff16bd1c1 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
@@ -18,6 +18,23 @@ namespace Flax.Deps.Dependencies
             get => new[] { TargetPlatform.Windows };
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -30,7 +47,7 @@ namespace Flax.Deps.Dependencies
             // Copy files
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
+                BuildStarted(platform, TargetArchitecture.x64);
                 var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                 Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.lib"), Path.Combine(depsFolder, "amd_ags_x64.lib"));
                 Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.dll"), Path.Combine(depsFolder, "amd_ags_x64.dll"));
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
index bb1c4fa3c..629a69070 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
@@ -2,6 +2,7 @@

 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using Flax.Build;

 namespace Flax.Deps.Dependencies
@@ -39,6 +40,36 @@ namespace Flax.Deps.Dependencies
             }
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -91,22 +122,22 @@ namespace Flax.Deps.Dependencies

             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                    case TargetPlatform.Windows:
-                    {
-                        var configuration = "Release";
-                        var binariesWin = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                            Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
-                            Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
-                        };
+                        case TargetPlatform.Windows:
+                        {
+                            var configuration = "Release";
+                            var binariesWin = new[]
+                            {
+                                Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
+                                Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
+                            };

-                        // Build for Windows
-                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
-                        foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                        {
+                            // Build for Windows
+                            File.Delete(Path.Combine(root, "CMakeCache.txt"));
                             var buildDir = Path.Combine(root, "build-" + architecture);
                             var solutionPath = Path.Combine(buildDir, "Assimp.sln");
                             SetupDirectory(buildDir, true);
@@ -116,42 +147,40 @@ namespace Flax.Deps.Dependencies
                             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                             foreach (var file in binariesWin)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                            break;
                         }
-
-                        break;
-                    }
-                    case TargetPlatform.Linux:
-                    {
-                        var envVars = new Dictionary<string, string>
+                        case TargetPlatform.Linux:
                         {
-                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                        };
+                            var envVars = new Dictionary<string, string>
+                            {
+                                { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                                { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                                { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                                { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                            };

-                        // Build for Linux
-                        RunCmake(root, platform, TargetArchitecture.x64, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
-                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
-                        configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
-                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                        Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
-                        break;
-                    }
-                    case TargetPlatform.Mac:
-                    {
-                        // Build for Mac
-                        foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                            // Build for Linux
+                            File.Delete(Path.Combine(root, "CMakeCache.txt"));
+                            RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
+                            Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
+                            configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
+                            var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                            Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
+                            break;
+                        }
+                        case TargetPlatform.Mac:
                         {
+                            // Build for Mac
+                            File.Delete(Path.Combine(root, "CMakeCache.txt"));
                             RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig);
                             Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
                             configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
                             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                             Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
                             Utilities.Run("make", "clean", null, root, Utilities.RunOptions.DefaultTool);
+                            break;
+                        }
                     }
-                    break;
-                }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
index e631b280b..0da78e580 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
@@ -28,6 +28,24 @@
             }
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -46,12 +64,12 @@

             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                    case TargetPlatform.Windows:
-                    {
-                        foreach (var architecture in new[] { 
TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
                     {
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
@@ -61,7 +79,7 @@ namespace Flax.Deps.Dependencies
                 }
             }
             break;
-            }
+                    }
             }
         }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
index 894af3840..f74494a30 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
@@ -1,6 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
-using System;
 using System.IO;
 using System.Linq;
 using Flax.Build;
@@ -31,22 +30,40 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var sdk = WindowsPlatformBase.GetSDKs().Last();
-                    var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
-                    string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
-
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        var sdk = WindowsPlatformBase.GetSDKs().Last();
+                        var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
+                        string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
+
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         string dxilLocation = @$"{binLocation}\{architecture}\dxil.dll";
@@ -60,9 +77,9 @@ namespace Flax.Deps.Dependencies
                         string d3dcompilerLibLocation = @$"{sdkLibLocation}\{architecture}\d3dcompiler.lib";
                         Utilities.FileCopy(dxcompilerLibLocation, Path.Combine(depsFolder, Path.GetFileName(dxcompilerLibLocation)));
                         Utilities.FileCopy(d3dcompilerLibLocation, Path.Combine(depsFolder, "d3dcompiler_47.lib"));
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
         }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
index c0d1a461f..cfbb88870 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
@@ -30,6 +30,30 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.XboxOne:
+                case TargetPlatform.XboxScarlett:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -47,44 +71,44 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFileNames)
                             Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.UWP:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
+                        var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                     }
-                    break;
-                }
-                case TargetPlatform.UWP:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
-                    var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
new file mode 100644
index 000000000..3f9a2148b
--- /dev/null
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
@@ -0,0 +1,92 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+using System.IO;
+using System.IO.Compression;
+using Flax.Build;
+
+namespace Flax.Deps.Dependencies
+{
+    /// <summary>
+    /// Visual Studio EnvDTE COM library. https://learn.microsoft.com/en-us/dotnet/api/envdte?view=visualstudiosdk-2022
+    /// </summary>
+    /// <seealso cref="Flax.Deps.Dependency" />
+    class EnvDTE : Dependency
+    {
+        /// <inheritdoc />
+        public override TargetPlatform[] Platforms
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetPlatform.Windows,
+                    };
+                default: return new TargetPlatform[0];
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override void Build(BuildOptions options)
+        {
+            // Get the source (shared under the Microsoft.VisualStudio.Setup.Configuration.Native folder instead of the type name)
+            var root = options.IntermediateFolder.Replace("/" + GetType().Name, "/Microsoft.VisualStudio.Setup.Configuration.Native");
+            var packagePath = Path.Combine(root, $"package.zip");
+            if (!File.Exists(packagePath))
+            {
+                Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/Microsoft.VisualStudio.Setup.Configuration.Native/3.14.2075", packagePath);
+            }
+            var extractedPath = Path.Combine(root, "extracted");
+            if (!Directory.Exists(extractedPath))
+            {
+                using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    archive.ExtractToDirectory(extractedPath);
+            }
+            root = extractedPath;
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
+                {
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var bin = Path.Combine(root, "lib", "native", "v141", architecture.ToString().ToLower());
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(bin, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"), Path.Combine(depsFolder, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"));
+
+                        var include = Path.Combine(root, "lib", "native", "include");
+                        Utilities.FileCopy(Path.Combine(include, "Setup.Configuration.h"), Path.Combine(options.ThirdPartyFolder, "Microsoft.VisualStudio.Setup.Configuration.Native", "Setup.Configuration.h"));
+                        break;
+                    }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
index 495de4734..58fb21b25 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
@@ -36,6 +36,24 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
index 1120a94f8..f3c3ff210 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -1,5 +1,6 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
@@ -16,40 +17,6 @@ namespace Flax.Deps.Dependencies
     {
         private string root, nvCloth;
 
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.Switch,
-                        TargetPlatform.Android,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -59,41 +26,51 @@ namespace Flax.Deps.Dependencies
             // Get the source
             CloneGitRepoSingleBranch(root, "https://github.com/FlaxEngine/NvCloth.git", "master");
 
+            // Patch the CMakeLists.txt to support custom compilation flags
+            foreach (var os in new[] { "android", "ios", "linux", "mac", "windows", })
+            {
+                var filePath = Path.Combine(nvCloth, "compiler", "cmake", os, "CMakeLists.txt");
+                var appendLine = "SET(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${NVCLOTH_CXX_FLAGS}\")";
+                if (!File.ReadAllText(filePath).Contains(appendLine))
+                    File.AppendAllText(filePath, Environment.NewLine + appendLine + Environment.NewLine);
+            }
+
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.PS4:
-                case TargetPlatform.PS5:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.Switch:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Android:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Mac:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.iOS:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Linux:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.PS4:
+                    case TargetPlatform.PS5:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.Switch:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Android:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Mac:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.iOS:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Linux:
+                        Build(options, platform, architecture);
+                        break;
+                    }
                 }
             }
 
@@ -110,7 +87,7 @@ namespace Flax.Deps.Dependencies
             // Peek options
             var binariesPrefix = string.Empty;
             var binariesPostfix = string.Empty;
-            var cmakeArgs = "-DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
             var cmakeName = string.Empty;
             var buildFolder = Path.Combine(nvCloth, "compiler", platform.ToString() + '_' + architecture.ToString());
             var envVars = new Dictionary<string, string>();
@@ -154,7 +131,7 @@ namespace Flax.Deps.Dependencies
                 }
                 break;
             case TargetPlatform.Mac:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                 cmakeName = "mac";
                 binariesPrefix = "lib";
                 break;
@@ -164,7 +141,7 @@ namespace Flax.Deps.Dependencies
                 binariesPrefix = "lib";
                 break;
             case TargetPlatform.Linux:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                 cmakeName = "linux";
                 binariesPrefix = "lib";
                 envVars.Add("CC", "clang-" + Configuration.LinuxClangMinVer);
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
index 319ad70b3..37e446ce1 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -1,5 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
-
+//#define USE_GIT_REPOSITORY
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -45,132 +45,75 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.iOS:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Android:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             var root = options.IntermediateFolder;
             var version = "1.24.3";
             var configuration = "Release";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5";
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
             var noSSL = true; // OpenAL Soft website has broken certs
 
-            foreach (var platform in options.Platforms)
-            {
-                BuildStarted(platform);
-                switch (platform)
-                {
-                case TargetPlatform.Windows:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "OpenAL32.lib",
-                        "OpenAL32.dll",
-                    };
-
-                    // Get the source
-                    CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
-                    GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
-
-                    // Build for Win64 and ARM64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
-                        var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
-
-                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\"");
-                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
-                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
-                        foreach (var file in binariesToCopy)
-                            Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
-                    }
-
-#if false
-                    // Get the binaries
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
-                    using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
-                    {
-                        if (!Directory.Exists(root))
-                            archive.ExtractToDirectory(root);
-                        root = Path.Combine(root, archive.Entries.First().FullName);
-                    }
-
-                    // Deploy Win64 binaries
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
-                    Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
-
-                    // Deploy license
-                    Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
-
-                    // Deploy header files
-                    var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
-                    foreach (var file in files)
-                    {
-                        Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
-                    }
+#if !USE_GIT_REPOSITORY
+            if (options.Platforms.Contains(TargetPlatform.Windows))
 #endif
-                    break;
-                }
-                case TargetPlatform.Linux:
+            {
+                // Get the source
+                CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
+                GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
+            }
+#if !USE_GIT_REPOSITORY
+            else
+            {
+                // Get the source
+                var packagePath = Path.Combine(root, $"package-{version}.zip");
+                if (!File.Exists(packagePath))
                 {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
-                                 $"-DALSOFT_REQUIRE_OSS=ON " +
-                                 $"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_JACK=ON " +
-                                 $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
-                                 $"-DALSOFT_EMBED_HRTF_DATA=YES ";
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
                     Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
                     if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
                     {
+                        // TODO: Maybe use PowerShell Expand-Archive instead?
                         var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
                         Utilities.Run(sevenZip, "x package.zip", null, root);
                         Utilities.Run(sevenZip, "x package", null, root);
@@ -179,89 +122,167 @@ namespace Flax.Deps.Dependencies
                     {
                         Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
                     }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build for Linux
-                    Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    BuildCmake(buildDir, configuration, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES";
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
                 }
-                case TargetPlatform.Mac:
+            }
+#endif
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
                 {
-                    var binariesToCopy = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
+                    case TargetPlatform.Windows:
                     {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                        var binariesToCopy = new[]
+                        {
+                            "OpenAL32.lib",
+                            "OpenAL32.dll",
+                        };
 
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Windows
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
                         SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" " + cmakeArgs);
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
+                                     $"-DALSOFT_REQUIRE_OSS=ON " +
+                                     $"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
+                                     $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
+                                     $"-DALSOFT_REQUIRE_JACK=ON " +
+                                     $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
+                                     $"-DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+
+                        // Use separate build directory
+#if !USE_GIT_REPOSITORY
+                        root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build for Linux
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, configuration, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = "-DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+
+                        // Use separate build directory
+#if !USE_GIT_REPOSITORY
+                        root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+
+                        // Use separate build directory
+#if !USE_GIT_REPOSITORY
+                        root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build for Mac
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    var binariesToCopy = new[]
+                    case TargetPlatform.iOS:
                     {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                    {
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        // Use separate build directory
+#if !USE_GIT_REPOSITORY
+                        root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build for iOS
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                     }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for iOS
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                 }
             }
+
+            // Deploy license
+            Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
+
+            // Deploy header files
+            var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
+            foreach (var file in files)
+            {
+                Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
+            }
         }
     }
 }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index 39f7ad975..18bb4e69f 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -17,40 +17,6 @@ namespace Flax.Deps.Dependencies
    /// <seealso cref="Flax.Deps.Dependency" />
    class PhysX : Dependency
    {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
         private string root;
         private string projectGenDir;
         private string projectGenPath;
@@ -65,8 +31,13 @@ namespace Flax.Deps.Dependencies
                 if (cmakeSwitch.HasAttribute("name") && cmakeSwitch.Attributes["name"].Value == name)
                 {
                     cmakeSwitch.Attributes["value"].Value = value;
+                    return;
                 }
             }
+            var child = cmakeSwitches.OwnerDocument.CreateElement(cmakeSwitches.ChildNodes[0].Name);
+            child.SetAttribute("name", name);
+            child.SetAttribute("value", value);
+            cmakeSwitches.AppendChild(child);
         }
 
         private void Build(BuildOptions options, string preset, TargetPlatform targetPlatform, TargetArchitecture architecture)
@@ -94,11 +65,14 @@ namespace Flax.Deps.Dependencies
             case TargetPlatform.Windows:
                 if (architecture == TargetArchitecture.ARM64)
                 {
-                    // Windows ARM64 doesn't have GPU support, so avoid copying those DLLs around
+                    // Windows ARM64 doesn't have precompiled files for GPU support, so avoid copying those DLLs around
                     ConfigureCmakeSwitch(cmakeSwitches, "PX_COPY_EXTERNAL_DLL", "OFF");
                    ConfigureCmakeSwitch(cmakeParams, "PX_COPY_EXTERNAL_DLL", "OFF");
                 }
                 break;
+            case TargetPlatform.Linux:
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
+                break;
             case TargetPlatform.Android:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_INSTALL_PREFIX", $"install/android-{Configuration.AndroidPlatformApi}/PhysX");
                 ConfigureCmakeSwitch(cmakeParams, "ANDROID_NATIVE_API_LEVEL", $"android-{Configuration.AndroidPlatformApi}");
@@ -106,6 +80,7 @@ namespace Flax.Deps.Dependencies
                 break;
             case TargetPlatform.Mac:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.MacOSXMinVer);
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
                 break;
             case TargetPlatform.iOS:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.iOSMinVer);
@@ -122,10 +97,11 @@ namespace Flax.Deps.Dependencies
             string bits;
             string arch;
             string binariesSubDir;
-            string buildPlatform;
+            string buildPlatform = architecture == TargetArchitecture.x86 ? "Win32" : architecture.ToString();
             bool suppressBitsPostfix = false;
             string binariesPrefix = string.Empty;
             var envVars = new Dictionary<string, string>();
+            envVars.Add("CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel);
             switch (architecture)
             {
             case TargetArchitecture.x86:
@@ -146,15 +122,6 @@ namespace Flax.Deps.Dependencies
                 break;
             default: throw new InvalidArchitectureException(architecture);
             }
-            switch (architecture)
-            {
-            case TargetArchitecture.x86:
-                buildPlatform = "Win32";
-                break;
-            default:
-                buildPlatform = architecture.ToString();
-                break;
-            }
             var msBuildProps = new Dictionary<string, string>();
             switch (targetPlatform)
             {
@@ -385,60 +352,84 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    Build(options, "vc17win64", platform, TargetArchitecture.x64);
-                    Build(options, "vc17win-arm64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    Build(options, "linux", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps4", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps5", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                case TargetPlatform.XboxOne:
-                {
-                    Build(options, "vc16win64", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    Build(options, "android", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "switch64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    Build(options, "mac64", platform, TargetArchitecture.x64);
-                    Build(options, "mac-arm64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    Build(options, "ios64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        if (architecture == TargetArchitecture.x64 || architecture == TargetArchitecture.ARM64)
+                        {
+                            if (WindowsPlatform.GetToolsets().Any(x => x.Key == WindowsPlatformToolset.v145))
+                            {
+                                try
+                                {
+                                    Build(options, architecture == TargetArchitecture.x64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
+                                }
+                                catch (Exception e)
+                                {
+                                    Log.Warning($"Failed to generate VS2026 solution for PhysX, fallback to VS2022: {e.Message}");
+                                    Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
+                                }
+                            }
+                            else
+                                Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
+                        }
+                        else
+                            throw new InvalidArchitectureException(architecture);
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        Build(options, "linux", platform, architecture);
+                        break;
+                    }
+                    case TargetPlatform.PS4:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps4", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps5", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    case TargetPlatform.XboxOne:
+                    {
+                        Build(options, "vc16win64", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        Build(options, "android", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Switch:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "switch64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        if (architecture == TargetArchitecture.x64)
+                            Build(options, "mac64", platform, architecture);
+                        else if (architecture == TargetArchitecture.ARM64)
+                            Build(options, "mac-arm64", platform, architecture);
+                        else
+                            throw new InvalidArchitectureException(architecture);
+                        break;
+                    }
+                    case TargetPlatform.iOS:
+                    {
+                        Build(options, "ios64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    }
                 }
             }
 
@@ -446,7 +437,7 @@ namespace Flax.Deps.Dependencies
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "PhysX");
             Directory.GetFiles(dstIncludePath, "*.h", SearchOption.AllDirectories).ToList().ForEach(File.Delete);
             Utilities.FileCopy(Path.Combine(root, "LICENSE.md"), Path.Combine(dstIncludePath, "License.txt"));
-            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath);
+            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath, true, true);
         }
     }
 }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
index 617b82af0..19e314326 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
@@ -29,6 +29,24 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -47,23 +65,23 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    // Build for Win64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        // Build for Windows
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString(), new Dictionary<string, string>() { { "RestorePackagesConfig", "true" } });
                         var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
                         foreach (var file in outputFileNames)
                         {
                             Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), "Release", file), Path.Combine(depsFolder, file));
                         }
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs b/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs
new file mode 100644
index 000000000..84a6f4f8b
--- /dev/null
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs
@@ -0,0 +1,91 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using Flax.Build;
+using Flax.Build.Platforms;
+
+namespace Flax.Deps.Dependencies
+{
+    /// <summary>
+    /// WinPixEventRuntime. https://github.com/microsoft/PixEvents
+    /// </summary>
+    /// <seealso cref="Flax.Deps.Dependency" />
+    class WinPixEventRuntime : Dependency
+    {
+        /// <inheritdoc />
+        public override TargetPlatform[] Platforms
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetPlatform.Windows,
+                    };
+                default: return new TargetPlatform[0];
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override void Build(BuildOptions options)
+        {
+            // Get the source
+            var root = options.IntermediateFolder;
+            var packagePath = Path.Combine(root, $"package.zip");
+            if (!File.Exists(packagePath))
+            {
+                Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/WinPixEventRuntime/1.0.240308001", packagePath);
+            }
+            var extractedPath = Path.Combine(root, "extracted");
+            if (!Directory.Exists(extractedPath))
+            {
+                using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    archive.ExtractToDirectory(extractedPath);
+            }
+            root = extractedPath;
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
+                {
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var bin = Path.Combine(root, "bin", architecture.ToString());
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(bin, "WinPixEventRuntime.dll"), Path.Combine(depsFolder, "WinPixEventRuntime.dll"));
+                        Utilities.FileCopy(Path.Combine(bin, "WinPixEventRuntime.lib"), Path.Combine(depsFolder, "WinPixEventRuntime.lib"));
+                        break;
+                    }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
index d5886810d..62a2b1097 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
@@ -1,6 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
-using System.Collections.Generic;
 using System.IO;
 using Flax.Build;
@@ -34,6 +33,30 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -45,14 +68,14 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-
-                    foreach (var architecture in new []{ TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        string buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    case TargetPlatform.Windows:
+                    {
+                        string buildDir = Path.Combine(root, "build-" + architecture);
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
                         var lib = architecture == TargetArchitecture.ARM64 ? "astcenc-neon-static.lib" : "astcenc-sse2-static.lib";
                         SetupDirectory(buildDir, true);
@@ -60,12 +83,11 @@ namespace Flax.Deps.Dependencies
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, "Source/Release", lib), Path.Combine(depsFolder, "astcenc.lib"));
-                    }
-                    break;
-                case TargetPlatform.Mac:
-                    foreach (var architecture in new []{ TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                     {
-                        string buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        string buildDir = Path.Combine(root, "build-" + architecture);
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
                         var lib = architecture == TargetArchitecture.ARM64 ? "libastcenc-neon-static.a" : "libastcenc-sse2-static.a";
                         SetupDirectory(buildDir, true);
@@ -73,8 +95,9 @@ namespace Flax.Deps.Dependencies
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, "Source", lib), Path.Combine(depsFolder, "libastcenc.a"));
+                        break;
+                    }
                     }
-                    break;
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
index 447f573a7..2d25fed3d 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
@@ -41,6 +41,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -69,15 +99,15 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    // Build for Win64 and ARM64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    case TargetPlatform.Windows:
+                    {
+                        // Build for Windows
+                        var buildDir = Path.Combine(root, "build-" + architecture);
                         var solutionPath = Path.Combine(buildDir, "CURL.sln");
                         RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_CURL_EXE=OFF -DBUILD_SHARED_LIBS=OFF -DCURL_STATIC_CRT=OFF");
@@ -85,57 +115,55 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopyWin)
                             Utilities.FileCopy(Path.Combine(buildDir, "lib", configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    // Build for Linux
-                    var settings = new[]
+                    case TargetPlatform.Linux:
                     {
-                        "--without-librtmp",
-                        "--without-ssl",
-                        "--with-gnutls",
-                        "--disable-ipv6",
-                        "--disable-manual",
-                        "--disable-verbose",
-                        "--disable-shared",
-                        "--enable-static",
-                        "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-                    Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.DefaultTool);
-                    Utilities.Run(Path.Combine(root, "configure"), string.Join(" ", settings) + " --prefix=\"" + buildDir + "\"", null, root, Utilities.RunOptions.DefaultTool, envVars);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
-                    Utilities.Run("make", "install", null, root, Utilities.RunOptions.DefaultTool);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    var filename = "libcurl.a";
-                    Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    // Build for Mac
-                    var settings = new[]
-                    {
-                        "--with-secure-transport",
-                        "--without-librtmp",
-                        "--disable-ipv6",
-                        "--disable-manual",
-                        "--disable-verbose",
-                        "--disable-shared",
-                        "--enable-static",
-                        "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
-                    };
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        // Build for Linux
+                        var settings = new[]
+                        {
+                            "--without-librtmp",
+                            //"--without-ssl",
+                            "--with-gnutls",
+                            "--disable-ipv6",
+                            "--disable-manual",
+                            "--disable-verbose",
+                            "--disable-shared",
+                            "--enable-static",
+                            "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var buildDir = Path.Combine(root, "build");
+                        SetupDirectory(buildDir, true);
+                        Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.DefaultTool);
+                        Utilities.Run(Path.Combine(root, "configure"), string.Join(" ", settings) + " --prefix=\"" + buildDir + "\"", null, root, Utilities.RunOptions.DefaultTool, envVars);
+                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
+                        Utilities.Run("make", "install", null, root, Utilities.RunOptions.DefaultTool);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        var filename = "libcurl.a";
+                        Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                     {
+                        // Build for Mac
+                        var settings = new[]
+                        {
+                            "--with-secure-transport",
+                            "--without-librtmp",
+                            "--disable-ipv6",
+                            "--disable-manual",
+                            "--disable-verbose",
+                            "--disable-shared",
+                            "--enable-static",
+                            "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
+                        };
+                        var arch = GetAppleArchName(architecture);
                         var archName = arch + "-apple-darwin19";
                         if (architecture == TargetArchitecture.ARM64)
@@ -162,9 +190,9 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         var filename = "libcurl.a";
                         Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs b/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
index 7017560fb..34fac56e0 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
@@ -30,27 +30,45 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
/// public override void Build(BuildOptions options) { foreach (var platform in options.Platforms) { - BuildStarted(platform); - switch (platform) + foreach (var architecture in options.Architectures) { - case TargetPlatform.Windows: - { - var sdk = WindowsPlatformBase.GetSDKs().Last(); - foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 }) + BuildStarted(platform, architecture); + switch (platform) { + case TargetPlatform.Windows: + { + var sdk = WindowsPlatformBase.GetSDKs().Last(); var depsFolder = GetThirdPartyFolder(options, platform, architecture); var libLocation = @$"{sdk.Value}Debuggers\lib\{architecture}\dbghelp.lib"; var dllLocation = @$"{sdk.Value}Debuggers\{architecture}\dbghelp.dll"; Utilities.FileCopy(libLocation, Path.Combine(depsFolder, Path.GetFileName(libLocation))); Utilities.FileCopy(dllLocation, Path.Combine(depsFolder, Path.GetFileName(dllLocation))); + break; + } } - break; - } } } } diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs index 89ed09a72..d43c73770 100644 --- a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs +++ b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs @@ -15,40 +15,6 @@ namespace Flax.Deps.Dependencies /// class freetype : Dependency { - /// - public override TargetPlatform[] Platforms - { - get - { - switch (BuildPlatform) - { - case TargetPlatform.Windows: - return new[] - { - TargetPlatform.Windows, - TargetPlatform.XboxOne, - TargetPlatform.PS4, - TargetPlatform.PS5, - TargetPlatform.XboxScarlett, - TargetPlatform.Android, - TargetPlatform.Switch, - }; - case TargetPlatform.Linux: - return new[] - { - TargetPlatform.Linux, - }; - case TargetPlatform.Mac: - return new[] - { - TargetPlatform.Mac, - TargetPlatform.iOS, - }; - default: return new TargetPlatform[0]; - } - } - } - /// public override void Build(BuildOptions options) { @@ -94,171 +60,167 @@ namespace Flax.Deps.Dependencies foreach (var platform in options.Platforms) { - BuildStarted(platform); - switch (platform) + foreach (var architecture in options.Architectures) { - case TargetPlatform.Windows: - { - // Patch the RuntimeLibrary value - File.WriteAllText(vcxprojPath, vcxprojContents); - - // Build for Windows - foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 }) + BuildStarted(platform, architecture); + switch (platform) { + case TargetPlatform.Windows: + { + // Patch the RuntimeLibrary value + File.WriteAllText(vcxprojPath, vcxprojContents); + + // Build for Windows Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, architecture.ToString(), msvcProps); var depsFolder = GetThirdPartyFolder(options, platform, architecture); foreach (var filename in binariesToCopyMsvc) Utilities.FileCopy(Path.Combine(root, "objs", architecture.ToString(), configurationMsvc, filename), Path.Combine(depsFolder, filename)); + break; } - break; - } - case TargetPlatform.Linux: - { - var envVars = new Dictionary + case TargetPlatform.Linux: { - { "CC", "clang-" + Configuration.LinuxClangMinVer }, - { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer }, - { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel }, - }; + var envVars = new Dictionary + { + { "CC", "clang-" + Configuration.LinuxClangMinVer }, + { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer }, + { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel }, + }; - // Fix scripts - Utilities.Run("dos2unix", "autogen.sh", null, root, 
Utilities.RunOptions.ThrowExceptionOnError, envVars); - Utilities.Run("dos2unix", "configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); - //Utilities.Run("sed", "-i -e \'s/\r$//\' autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); - //Utilities.Run("sed", "-i -e \'s/\r$//\' configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); - Utilities.Run("chmod", "+x autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError); - Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.ThrowExceptionOnError); + // Fix scripts + Utilities.Run("dos2unix", "autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); + Utilities.Run("dos2unix", "configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); + //Utilities.Run("sed", "-i -e \'s/\r$//\' autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); + //Utilities.Run("sed", "-i -e \'s/\r$//\' configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); + Utilities.Run("chmod", "+x autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError); + Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.ThrowExceptionOnError); - Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); + Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars); - // Disable using libpng even if it's found on the system - var cmakeFile = Path.Combine(root, "CMakeLists.txt"); - File.WriteAllText(cmakeFile, - File.ReadAllText(cmakeFile) - .Replace("find_package(PNG)", "") - .Replace("find_package(ZLIB)", "") - .Replace("find_package(BZip2)", "") - ); + // Disable using libpng even if it's found on the system + var cmakeFile = Path.Combine(root, "CMakeLists.txt"); + File.WriteAllText(cmakeFile, + File.ReadAllText(cmakeFile) + .Replace("find_package(PNG)", "") + .Replace("find_package(ZLIB)", "") + .Replace("find_package(BZip2)", "") + ); - // Build for Linux - SetupDirectory(buildDir, true); - var toolchain = UnixToolchain.GetToolchainName(platform, TargetArchitecture.x64); - Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars); - Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.DefaultTool, envVars); - var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64); - Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName)); - - break; - } - case TargetPlatform.PS4: - { - // Get the build data files - Utilities.DirectoryCopy( - Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"), - Path.Combine(root, "builds", "PS4"), false, true); - - // Build for PS4 - var solutionPath = Path.Combine(root, "builds", "PS4", "freetype.sln"); - Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "ORBIS"); - var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64); - Utilities.FileCopy(Path.Combine(root, "lib", "PS4", libraryFileName), Path.Combine(depsFolder, libraryFileName)); - - break; - } - case TargetPlatform.PS5: - { - // Get the build data files - Utilities.DirectoryCopy( - 
Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
-                    Path.Combine(root, "builds", "PS5"), false, true);
-                Utilities.ReplaceInFile(Path.Combine(root, "include\\freetype\\config\\ftstdlib.h"), "#define ft_getenv getenv", "char* ft_getenv(const char* n);");
-
-                // Build for PS5
-                var solutionPath = Path.Combine(root, "builds", "PS5", "freetype.sln");
-                Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "PROSPERO");
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                Utilities.FileCopy(Path.Combine(root, "lib", "PS5", libraryFileName), Path.Combine(depsFolder, libraryFileName));
-
-                break;
-            }
-            case TargetPlatform.XboxOne:
-            {
-                // Build for Xbox One x64
-                Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                foreach (var filename in binariesToCopyMsvc)
-                    Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
-
-                break;
-            }
-            case TargetPlatform.XboxScarlett:
-            {
-                // Build for Xbox Scarlett
-                Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                foreach (var filename in binariesToCopyMsvc)
-                    Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
-
-                break;
-            }
-            case TargetPlatform.Android:
-            {
-                // Disable using libpng even if it's found on the system
-                var cmakeFile = Path.Combine(root, "CMakeLists.txt");
-                File.WriteAllText(cmakeFile,
-                                  File.ReadAllText(cmakeFile)
-                                      .Replace("find_package(PNG)", "")
-                                      .Replace("find_package(ZLIB)", "")
-                                      .Replace("find_package(BZip2)", "")
-                                 );
-
-                // Build for Android
-                SetupDirectory(buildDir, true);
-                RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
-                BuildCmake(buildDir);
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                break;
-            }
-            case TargetPlatform.Switch:
-            {
-                // Build for Switch
-                SetupDirectory(buildDir, true);
-                RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release");
-                BuildCmake(buildDir);
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                break;
-            }
-            case TargetPlatform.Mac:
-            {
-                // Build for Mac
-                foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                {
+                // Build for Linux
                 SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, architecture, ".. -DCMAKE_BUILD_TYPE=Release");
+                var toolchain = UnixToolchain.GetToolchainName(platform, architecture);
+                Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
+                Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
+                var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                break;
+            }
+            case TargetPlatform.PS4:
+            {
+                // Get the build data files
+                Utilities.DirectoryCopy(
+                    Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
+                    Path.Combine(root, "builds", "PS4"), false, true);
+
+                // Build for PS4
+                var solutionPath = Path.Combine(root, "builds", "PS4", "freetype.sln");
+                Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "ORBIS");
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                Utilities.FileCopy(Path.Combine(root, "lib", "PS4", libraryFileName), Path.Combine(depsFolder, libraryFileName));
+
+                break;
+            }
+            case TargetPlatform.PS5:
+            {
+                // Get the build data files
+                Utilities.DirectoryCopy(
+                    Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
+                    Path.Combine(root, "builds", "PS5"), false, true);
+                Utilities.ReplaceInFile(Path.Combine(root, "include\\freetype\\config\\ftstdlib.h"), "#define ft_getenv getenv", "char* ft_getenv(const char* n);");
+
+                // Build for PS5
+                var solutionPath = Path.Combine(root, "builds", "PS5", "freetype.sln");
+                Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "PROSPERO");
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                Utilities.FileCopy(Path.Combine(root, "lib", "PS5", libraryFileName), Path.Combine(depsFolder, libraryFileName));
+
+                break;
+            }
+            case TargetPlatform.XboxOne:
+            {
+                // Build for Xbox One x64
+                Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                foreach (var filename in binariesToCopyMsvc)
+                    Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
+
+                break;
+            }
+            case TargetPlatform.XboxScarlett:
+            {
+                // Build for Xbox Scarlett
+                Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                foreach (var filename in binariesToCopyMsvc)
+                    Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
+
+                break;
+            }
+            case TargetPlatform.Android:
+            {
+                // Disable using libpng even if it's found on the system
+                var cmakeFile = Path.Combine(root, "CMakeLists.txt");
+                File.WriteAllText(cmakeFile,
+                                  File.ReadAllText(cmakeFile)
+                                      .Replace("find_package(PNG)", "")
+                                      .Replace("find_package(ZLIB)", "")
+                                      .Replace("find_package(BZip2)", "")
+                                 );
+
+                // Build for Android
+                SetupDirectory(buildDir, true);
+                RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
+                BuildCmake(buildDir);
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                break;
+            }
+            case TargetPlatform.Switch:
+            {
+                // Build for Switch
+                SetupDirectory(buildDir, true);
+                RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
+                BuildCmake(buildDir);
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                break;
+            }
+            case TargetPlatform.Mac:
+            {
+                // Build for Mac
+                SetupDirectory(buildDir, true);
+                RunCmake(buildDir, platform, architecture, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
                 BuildCmake(buildDir);
                 var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                 Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                break;
             }
-                break;
-            }
-            case TargetPlatform.iOS:
-            {
-                // Fix archive creation issue due to missing ar tool
-                Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_SYSTEM_NAME Darwin)", "set(CMAKE_SYSTEM_NAME Darwin)\nset(CMAKE_AR ar CACHE FILEPATH \"\" FORCE)");
+            case TargetPlatform.iOS:
+            {
+                // Fix archive creation issue due to missing ar tool
+                Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_SYSTEM_NAME Darwin)", "set(CMAKE_SYSTEM_NAME Darwin)\nset(CMAKE_AR ar CACHE FILEPATH \"\" FORCE)");

-                // Fix freetype toolchain rejecting min iPhone version
-                Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_OSX_DEPLOYMENT_TARGET \"\"", "set(CMAKE_OSX_DEPLOYMENT_TARGET \"${CMAKE_OSX_DEPLOYMENT_TARGET}\"");
+                // Fix freetype toolchain rejecting min iPhone version
+                Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_OSX_DEPLOYMENT_TARGET \"\"", "set(CMAKE_OSX_DEPLOYMENT_TARGET \"${CMAKE_OSX_DEPLOYMENT_TARGET}\"");

-                // Build for iOS
-                SetupDirectory(buildDir, true);
-                RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
-                BuildCmake(buildDir);
-                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                break;
-            }
+                // Build for iOS
+                SetupDirectory(buildDir, true);
+                RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
+                BuildCmake(buildDir);
+                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                break;
+            }
+            }
         }
     }
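Note on the `-DCMAKE_POLICY_VERSION_MINIMUM=3.5` flag that recurs throughout these hunks: CMake 4.x removed compatibility with projects whose cmake_minimum_required is below 3.5, and this flag opts such older third-party projects back in so they still configure. A minimal sketch of how the shim could be centralized instead of repeated per call site; the helper name is hypothetical and not part of the patch, and it assumes `using System;` plus the CMakeVersion property defined in Dependency.cs later in this diff:

// Hypothetical helper, for illustration only (not part of the patch).
static string WithPolicyShim(string cmakeArgs)
{
    // CMakeVersion is the cached "cmake --version" value from Dependency.cs.
    // Only CMake 4.x needs the compatibility flag for pre-3.5 projects.
    if (CMakeVersion >= new Version(4, 0))
        return "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 " + cmakeArgs;
    return cmakeArgs;
}
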
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
index a876083f8..32c14a037 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
@@ -1,5 +1,6 @@
 // Copyright (c) Wojciech Figat. All rights reserved.

+using System;
 using System.IO;
 using Flax.Build;

@@ -38,13 +39,43 @@ namespace Flax.Deps.Dependencies
             }
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             var root = options.IntermediateFolder;
             var installDir = Path.Combine(root, "install");
             var configuration = "Release";
-            var cmakeArgs = string.Format("-DCMAKE_INSTALL_PREFIX=\"{0}\" -DCMAKE_BUILD_TYPE={1} -DENABLE_RTTI=ON -DENABLE_CTEST=OFF -DENABLE_HLSL=ON -DENABLE_SPVREMAPPER=ON -DENABLE_GLSLANG_BINARIES=OFF", installDir, configuration);
+            var cmakeArgs = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_INSTALL_PREFIX=\"{installDir}\" -DCMAKE_BUILD_TYPE={configuration} -DENABLE_RTTI=ON -DENABLE_CTEST=OFF -DENABLE_HLSL=ON -DENABLE_SPVREMAPPER=ON -DENABLE_GLSLANG_BINARIES=OFF";
             var libsRoot = Path.Combine(installDir, "lib");

             // Get the source
@@ -52,97 +83,93 @@ namespace Flax.Deps.Dependencies

             // Setup the external sources
             // Requires distutils (pip install setuptools)
-            Utilities.Run("python", "update_glslang_sources.py", null, root, Utilities.RunOptions.ConsoleLogOutput);
+            if (Utilities.Run(BuildPlatform != TargetPlatform.Mac ? "python" : "python3", "update_glslang_sources.py", null, root, Utilities.RunOptions.ConsoleLogOutput) != 0)
+                throw new Exception("Failed to update glslang sources, make sure setuptools python package is installed.");

             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var outputFiles = new[]
-                    {
-                        Path.Combine(libsRoot, "GenericCodeGen.lib"),
-                        Path.Combine(libsRoot, "MachineIndependent.lib"),
-                        Path.Combine(libsRoot, "HLSL.lib"),
-                        Path.Combine(libsRoot, "OSDependent.lib"),
-                        Path.Combine(libsRoot, "OGLCompiler.lib"),
-                        Path.Combine(libsRoot, "SPIRV-Tools-opt.lib"),
-                        Path.Combine(libsRoot, "SPIRV-Tools.lib"),
-                        Path.Combine(libsRoot, "SPIRV.lib"),
-                        Path.Combine(libsRoot, "glslang.lib"),
-                    };
+                    BuildStarted(platform, architecture);

-                    // Build for Windows
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    switch (platform)
                     {
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    case TargetPlatform.Windows:
+                    {
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "GenericCodeGen.lib"),
+                            Path.Combine(libsRoot, "MachineIndependent.lib"),
+                            Path.Combine(libsRoot, "HLSL.lib"),
+                            Path.Combine(libsRoot, "OSDependent.lib"),
+                            Path.Combine(libsRoot, "OGLCompiler.lib"),
+                            Path.Combine(libsRoot, "SPIRV-Tools-opt.lib"),
+                            Path.Combine(libsRoot, "SPIRV-Tools.lib"),
+                            Path.Combine(libsRoot, "SPIRV.lib"),
+                            Path.Combine(libsRoot, "glslang.lib"),
+                        };
+
+                        // Build for Windows
                         var solutionPath = Path.Combine(buildDir, "glslang.sln");
-                        SetupDirectory(buildDir, false);
-                        RunCmake(root, platform, architecture, cmakeArgs + $" -B\"{buildDir}\"");
-                        Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFiles)
                         {
                             Utilities.FileCopy(file, Path.Combine(depsFolder, Path.GetFileName(file)));
                         }
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    var outputFiles = new[]
+                    case TargetPlatform.Linux:
                     {
-                        Path.Combine(libsRoot, "libGenericCodeGen.a"),
-                        Path.Combine(libsRoot, "libMachineIndependent.a"),
-                        Path.Combine(libsRoot, "libHLSL.a"),
-                        Path.Combine(libsRoot, "libOSDependent.a"),
-                        Path.Combine(libsRoot, "libOGLCompiler.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools.a"),
-                        Path.Combine(libsRoot, "libSPIRV.a"),
-                        Path.Combine(libsRoot, "libglslang.a"),
-                    };
-                    var buildDir = root;
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "libGenericCodeGen.a"),
+                            Path.Combine(libsRoot, "libMachineIndependent.a"),
+                            Path.Combine(libsRoot, "libHLSL.a"),
+                            Path.Combine(libsRoot, "libOSDependent.a"),
+                            Path.Combine(libsRoot, "libOGLCompiler.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools.a"),
+                            Path.Combine(libsRoot, "libSPIRV.a"),
+                            Path.Combine(libsRoot, "libglslang.a"),
+                        };

-                    // Build for Linux
-                    RunCmake(root, platform, TargetArchitecture.x64, cmakeArgs);
-                    Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.ConsoleLogOutput);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFiles)
-                    {
-                        var dst = Path.Combine(depsFolder, Path.GetFileName(file));
-                        Utilities.FileCopy(file, dst);
-                        //Utilities.Run("strip", string.Format("-s \"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
+                        // Build for Linux
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
+                        Utilities.Run("make", null, null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in outputFiles)
+                        {
+                            var dst = Path.Combine(depsFolder, Path.GetFileName(file));
+                            Utilities.FileCopy(file, dst);
+                            //Utilities.Run("strip", string.Format("-s \"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
+                        }
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    var outputFiles = new[]
+                    case TargetPlatform.Mac:
                     {
-                        Path.Combine(libsRoot, "libGenericCodeGen.a"),
-                        Path.Combine(libsRoot, "libMachineIndependent.a"),
-                        Path.Combine(libsRoot, "libHLSL.a"),
-                        Path.Combine(libsRoot, "libOSDependent.a"),
-                        Path.Combine(libsRoot, "libOGLCompiler.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools.a"),
-                        Path.Combine(libsRoot, "libSPIRV.a"),
-                        Path.Combine(libsRoot, "libglslang.a"),
-                    };
-                    var buildDir = root;
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "libGenericCodeGen.a"),
+                            Path.Combine(libsRoot, "libMachineIndependent.a"),
+                            Path.Combine(libsRoot, "libHLSL.a"),
+                            Path.Combine(libsRoot, "libOSDependent.a"),
+                            Path.Combine(libsRoot, "libOGLCompiler.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools.a"),
+                            Path.Combine(libsRoot, "libSPIRV.a"),
+                            Path.Combine(libsRoot, "libglslang.a"),
+                        };

-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
-                        RunCmake(root, platform, architecture, cmakeArgs);
-                        Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
-                        Utilities.Run("make", null, null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        // Build for Mac
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
+                        Utilities.Run("make", null, null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFiles)
                         {
@@ -150,9 +177,9 @@ namespace Flax.Deps.Dependencies
                             Utilities.FileCopy(file, dst);
                             Utilities.Run("strip", string.Format("\"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
                         }
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
index 57d2f74fe..a90d1c2a0 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
@@ -53,6 +53,48 @@ namespace Flax.Deps.Dependencies
             }
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.XboxOne:
+                case TargetPlatform.XboxScarlett:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                case TargetPlatform.Switch:
+                case TargetPlatform.Android:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         private string root;
         private string monoPropsPath;
         private string monoPreprocesorDefines;
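Both glslang and mono now advertise the architectures they can produce per host platform, and their Build methods move to the same reshaped loop: platforms come from options.Platforms, architectures from options.Architectures, and BuildStarted logs the pair. Reduced to its skeleton (a sketch, with the dependency-specific work elided), the pattern every Build method in this diff migrates to looks like:

// Skeleton of the refactored Build loop; per-platform steps elided.
public override void Build(BuildOptions options)
{
    foreach (var platform in options.Platforms)
    {
        foreach (var architecture in options.Architectures)
        {
            // Logs e.g. "Building glslang for Windows (ARM64)"
            BuildStarted(platform, architecture);
            switch (platform)
            {
            // ... per-platform build steps, now using 'architecture' ...
            }
        }
    }
}
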
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
index 66909f6b9..f67244c9b 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
@@ -43,6 +43,9 @@ namespace Flax.Deps.Dependencies
             }
         }

+        /// <inheritdoc />
+        public override bool BuildByDefault => false;
+
         private string root;
         private bool cleanArtifacts;

@@ -349,24 +352,27 @@ namespace Flax.Deps.Dependencies
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                var platformData = Path.Combine(GetBinariesFolder(options, platform), "Data", "nethost");
-                if (Directory.Exists(platformData))
-                    Utilities.DirectoryCopy(platformData, root, true, true);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.PS4:
-                case TargetPlatform.PS5:
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                    Build(options, platform, TargetArchitecture.x64);
+                    BuildStarted(platform, architecture);
+                    var platformData = Path.Combine(GetBinariesFolder(options, platform), "Data", "nethost");
+                    if (Directory.Exists(platformData))
+                        Utilities.DirectoryCopy(platformData, root, true, true);
+                    switch (platform)
+                    {
+                    case TargetPlatform.PS4:
+                    case TargetPlatform.PS5:
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                        Build(options, platform, TargetArchitecture.x64);
                         break;
-                case TargetPlatform.Android:
-                    Build(options, platform, TargetArchitecture.ARM64);
+                    case TargetPlatform.Android:
+                        Build(options, platform, TargetArchitecture.ARM64);
                         break;
-                case TargetPlatform.Switch:
-                    Build(options, platform, TargetArchitecture.ARM64);
+                    case TargetPlatform.Switch:
+                        Build(options, platform, TargetArchitecture.ARM64);
                         break;
+                    }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs b/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
index d1d94b4c1..6f18a9190 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
@@ -18,6 +18,23 @@ namespace Flax.Deps.Dependencies
             get => new[] { TargetPlatform.Windows };
         }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -30,7 +47,7 @@ namespace Flax.Deps.Dependencies
             // Copy files
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
+                BuildStarted(platform, TargetArchitecture.x64);
                 var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                 Utilities.FileCopy(Path.Combine(root, "amd64/nvapi64.lib"), Path.Combine(depsFolder, "nvapi64.lib"));
             }
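With `BuildByDefault => false`, nethost is now excluded from a plain deps build and compiles only when the user names it explicitly. The exact command-line plumbing is not shown in this diff, so the following is a hedged sketch of the selection test a runner could apply; the `requested` set is an assumption standing in for whatever option Flax.Build parses:

// Hypothetical selection helper, for illustration only.
// 'requested' holds dependency names the user asked for explicitly;
// the real option parsing in Flax.Build is not part of this patch.
static bool ShouldBuild(Dependency dependency, HashSet<string> requested)
{
    return dependency.BuildByDefault || requested.Contains(dependency.GetType().Name);
}
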
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index d22f8696f..15ca415da 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -15,56 +15,24 @@ namespace Flax.Deps.Dependencies
     /// </summary>
     class vorbis : Dependency
     {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
         private struct Binary
         {
             public string Filename;
             public string SrcFolder;
+            public string DstFilename;

-            public Binary(string filename, string srcFolder)
+            public Binary(string filename, string srcFolder, string dstFilename = null)
             {
                 Filename = filename;
                 SrcFolder = srcFolder;
+                DstFilename = dstFilename;
             }
         }

         private bool hasSourcesReady;
         private string root;
         private string rootMsvcLib;
-        private string configurationMsvc;
+        private string _configuration = "Release";
         private List<string> vcxprojContentsWindows;
         private string[] vcxprojPathsWindows;

@@ -74,22 +42,6 @@ namespace Flax.Deps.Dependencies
            new Binary("libvorbisfile_static.lib", "libvorbisfile"),
         };

-        private (string, string)[] vorbisBinariesToCopyWindowsCmake =
-        {
-            ("vorbis.lib", "libvorbis_static.lib"),
-            ("vorbisfile.lib", "libvorbisfile_static.lib"),
-        };
-
-        private Binary[] oggBinariesToCopyWindows =
-        {
-            new Binary("libogg_static.lib", "ogg"),
-        };
-
-        private (string, string)[] oggBinariesToCopyWindowsCmake =
-        {
-            ("ogg.lib", "libogg_static.lib"),
-        };
-
         private void PatchWindowsTargetPlatformVersion(string windowsTargetPlatformVersion, string platformToolset)
         {
             // Fix the MSVC project settings for Windows
@@ -107,7 +59,6 @@ namespace Flax.Deps.Dependencies
                 return;
             hasSourcesReady = true;
-            configurationMsvc = "Release";

             string oggRoot = Path.Combine(root, "libogg");
             string vorbisRoot = Path.Combine(root, "libvorbis");
@@ -197,7 +148,7 @@ namespace Flax.Deps.Dependencies
                 break;
             default: throw new InvalidArchitectureException(architecture);
             }
-            binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+            binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.PS4:
@@ -216,7 +167,7 @@ namespace Flax.Deps.Dependencies
                 buildDir, true, true);
                 Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "ogg", "ogg", "config_types.h"), Path.Combine(root, "libogg", "include", "ogg", "config_types.h"));
-                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.PS5:
@@ -237,7 +188,7 @@ namespace Flax.Deps.Dependencies
                 Utilities.FileCopy(
                 Path.Combine(GetBinariesFolder(options, platform), "Data", "ogg", "ogg", "config_types.h"),
                 Path.Combine(root, "libogg", "include", "ogg", "config_types.h"));
-                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.XboxOne:
@@ -245,21 +196,21 @@ namespace Flax.Deps.Dependencies
                 vcxprojPaths = vcxprojPathsWindows;
                 buildPlatform = "x64";
                 PatchWindowsTargetPlatformVersion("10.0", "v143");
-                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             case TargetPlatform.XboxScarlett:
                 buildDir = Path.Combine(rootMsvcLib, "win32", "VS2010");
                 vcxprojPaths = vcxprojPathsWindows;
                 buildPlatform = "x64";
                 PatchWindowsTargetPlatformVersion("10.0", "v143");
-                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             default: throw new InvalidPlatformException(platform);
             }

             // Build
             foreach (var vcxprojPath in vcxprojPaths)
-                Deploy.VCEnvironment.BuildSolution(vcxprojPath, configurationMsvc, buildPlatform);
+                Deploy.VCEnvironment.BuildSolution(vcxprojPath, _configuration, buildPlatform);

             // Copy binaries
             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
@@ -273,48 +224,109 @@ namespace Flax.Deps.Dependencies
             string oggRoot = Path.Combine(root, "libogg");
             string vorbisRoot = Path.Combine(root, "libvorbis");
-
             var oggBuildDir = Path.Combine(oggRoot, "build-" + architecture.ToString());
             var vorbisBuildDir = Path.Combine(vorbisRoot, "build-" + architecture.ToString());
+            var installDir = Path.Combine(root, "install");
             string ext;
+            string oggConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
+            string vorbisConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
+            string liboggFilename = "libogg";
+            Dictionary<string, string> envVars = new Dictionary<string, string>();
+            (string, string)[] oggBinariesToCopy;
+            Binary[] vorbisBinariesToCopy;
             switch (platform)
             {
             case TargetPlatform.Windows:
             case TargetPlatform.UWP:
             case TargetPlatform.XboxOne:
+                oggConfig += " -DBUILD_SHARED_LIBS=OFF";
+                vorbisConfig += " -DBUILD_SHARED_LIBS=OFF";
                 ext = ".lib";
+                liboggFilename = "ogg";
                 break;
             case TargetPlatform.Linux:
+                oggConfig += " -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
+                vorbisConfig += " -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
+                envVars = new Dictionary<string, string>
+                {
+                    { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                    { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                    { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                    { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                };
+                ext = ".a";
+                break;
+            case TargetPlatform.Mac:
+                //oggConfig += $" -DOGG_INCLUDE_DIR=\"{oggRoot}/install/include\" -DOGG_LIBRARY=\"{oggRoot}/install/lib\"";
                 ext = ".a";
                 break;
             default: throw new InvalidPlatformException(platform);
             }

-            var binariesToCopy = new List<(string, string)>();
-
-            // Build ogg
+            switch (platform)
             {
-                var solutionPath = Path.Combine(oggBuildDir, "ogg.sln");
-
-                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" -DBUILD_SHARED_LIBS=OFF");
-                Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
-                foreach (var file in oggBinariesToCopyWindowsCmake)
-                    binariesToCopy.Add((Path.Combine(oggBuildDir, configurationMsvc, file.Item1), file.Item2));
+            case TargetPlatform.Windows:
+            case TargetPlatform.UWP:
+            case TargetPlatform.XboxOne:
+                oggBinariesToCopy =
+                [
+                    ("ogg.lib", "libogg_static.lib")
+                ];
+                vorbisBinariesToCopy =
+                [
+                    new Binary("vorbis.lib", "libvorbis", "libvorbis_static.lib"),
+                    new Binary("vorbisfile.lib", "libvorbisfile", "libvorbisfile_static.lib")
+                ];
+                break;
+            case TargetPlatform.Linux:
+            case TargetPlatform.Mac:
+                oggBinariesToCopy =
+                [
+                    ("libogg.a", "libogg.a")
+                ];
+                vorbisBinariesToCopy =
+                [
+                    new Binary("libvorbis.a", "lib"),
+                    new Binary("libvorbisenc.a", "lib"),
+                    new Binary("libvorbisfile.a", "lib")
+                ];
+                break;
+            default: throw new InvalidPlatformException(platform);
             }
+            vorbisConfig += $" -DOGG_INCLUDE_DIR=\"{Path.Combine(installDir, "include")}\" -DOGG_LIBRARY=\"{Path.Combine(installDir, "lib", liboggFilename + ext)}\"";
+
+            var binariesToCopy = new List<(string, string)>();
+
+            SetupDirectory(installDir, true);

+            // Build ogg
+            {
+                SetupDirectory(oggBuildDir, true);
+                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" " + oggConfig, envVars);
+                if (platform == TargetPlatform.Windows)
+                    Deploy.VCEnvironment.BuildSolution(Path.Combine(oggBuildDir, "ogg.sln"), _configuration, architecture.ToString());
+                else
+                    BuildCmake(oggBuildDir);
+                Utilities.Run("cmake", $"--build . --config {_configuration} --target install", null, oggBuildDir, Utilities.RunOptions.DefaultTool);
+            }

             // Build vorbis
             {
-                var oggLibraryPath = Path.Combine(oggBuildDir, configurationMsvc, "ogg" + ext);
-                var solutionPath = Path.Combine(vorbisBuildDir, "vorbis.sln");
-
-                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" -DOGG_INCLUDE_DIR=\"{Path.Combine(oggRoot, "include")}\" -DOGG_LIBRARY=\"{oggLibraryPath}\" -DBUILD_SHARED_LIBS=OFF");
-                Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
-                foreach (var file in vorbisBinariesToCopyWindowsCmake)
-                    binariesToCopy.Add((Path.Combine(vorbisBuildDir, "lib", configurationMsvc, file.Item1), file.Item2));
+                SetupDirectory(vorbisBuildDir, true);
+                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" " + vorbisConfig);
+                if (platform == TargetPlatform.Windows)
+                    Deploy.VCEnvironment.BuildSolution(Path.Combine(vorbisBuildDir, "vorbis.sln"), _configuration, architecture.ToString());
+                else
+                    BuildCmake(vorbisBuildDir);
+                Utilities.Run("cmake", $"--build . --config {_configuration} --target install", null, vorbisBuildDir, Utilities.RunOptions.DefaultTool);
             }

             // Copy binaries
+            foreach (var file in oggBinariesToCopy)
+                binariesToCopy.Add((Path.Combine(installDir, "lib", file.Item1), file.Item2));
+            foreach (var file in vorbisBinariesToCopy)
+                binariesToCopy.Add((Path.Combine(installDir, "lib", file.Filename), file.DstFilename ?? file.Filename));
+
             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
             foreach (var file in binariesToCopy)
                 Utilities.FileCopy(file.Item1, Path.Combine(depsFolder, file.Item2));
@@ -337,203 +349,140 @@ namespace Flax.Deps.Dependencies

             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    BuildCmake(options, TargetPlatform.Windows, TargetArchitecture.x64);
-                    BuildCmake(options, TargetPlatform.Windows, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.UWP:
-                {
-                    BuildMsbuild(options, TargetPlatform.UWP, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                {
-                    BuildMsbuild(options, TargetPlatform.XboxOne, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    // Note: assumes the libogg-dev package is pre-installed on the system
-
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-
-                    var envVars = new Dictionary<string, string>
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var buildDir = Path.Combine(root, "build");
-
-                    Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.DefaultTool, envVars);
-
-                    // Build for Linux
-                    var toolchain = UnixToolchain.GetToolchainName(platform, TargetArchitecture.x64);
-                    Utilities.Run(Path.Combine(root, "configure"), string.Format("--host={0}", toolchain), null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    SetupDirectory(buildDir, true);
-                    Utilities.Run("cmake", "-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    BuildMsbuild(options, TargetPlatform.PS4, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    BuildMsbuild(options, TargetPlatform.PS5, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                {
-                    BuildMsbuild(options, TargetPlatform.XboxScarlett, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
-
-                    // Build for Android
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Get the source
-                    SetupDirectory(oggRoot, false);
-                    CloneGitRepo(root, "https://github.com/xiph/vorbis.git");
-                    GitCheckout(root, "master", "98eddc72d36e3421519d54b101c09b57e4d4d10d");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg"), oggRoot, true, true);
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/vorbis"), buildDir, true, true);
-
-                    // Build for Switch
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg", "include", "ogg", "config_types.h"), Path.Combine(oggRoot, "install", "include", "ogg", "config_types.h"));
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
-
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    case TargetPlatform.Windows:
                     {
+                        BuildCmake(options, TargetPlatform.Windows, architecture);
+                        break;
+                    }
+                    case TargetPlatform.UWP:
+                    {
+                        BuildMsbuild(options, TargetPlatform.UWP, architecture);
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    {
+                        BuildMsbuild(options, TargetPlatform.XboxOne, architecture);
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        BuildCmake(options, TargetPlatform.Linux, architecture);
+                        break;
+                    }
+                    case TargetPlatform.PS4:
+                    {
+                        BuildMsbuild(options, TargetPlatform.PS4, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        BuildMsbuild(options, TargetPlatform.PS5, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        BuildMsbuild(options, TargetPlatform.XboxScarlett, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");
+
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+
+                        // Build for Android
                         SetupDirectory(oggBuildDir, true);
-                        RunCmake(oggBuildDir, platform, architecture, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
                         SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
                         BuildCmake(buildDir);
-                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                         foreach (var file in binariesToCopyUnix)
                             Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
+                    case TargetPlatform.Switch:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");

-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        // Get the source
+                        SetupDirectory(oggRoot, false);
+                        CloneGitRepo(root, "https://github.com/xiph/vorbis.git");
+                        GitCheckout(root, "master", "98eddc72d36e3421519d54b101c09b57e4d4d10d");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg"), oggRoot, true, true);
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/vorbis"), buildDir, true, true);

-                    // Build for Mac
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
+                        // Build for Switch
+                        SetupDirectory(oggBuildDir, true);
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg", "include", "ogg", "config_types.h"), Path.Combine(oggRoot, "install", "include", "ogg", "config_types.h"));
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopyUnix)
+                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        BuildCmake(options, TargetPlatform.Mac, architecture);
+                        break;
+                    }
+                    case TargetPlatform.iOS:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");
+
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+
+                        // Build for iOS
+                        SetupDirectory(oggBuildDir, true);
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopyUnix)
+                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
+                    }
+                    }
                 }
             }

-            // Backup files
-            if (hasSourcesReady)
-                root = rootMsvcLib;
-            var srcIncludePath = Path.Combine(root, "include", "vorbis");
-            var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "vorbis");
-            foreach (var filename in filesToKeep)
-            {
-                var src = Path.Combine(dstIncludePath, filename);
-                var dst = Path.Combine(options.IntermediateFolder, filename + ".tmp");
-                Utilities.FileCopy(src, dst);
-            }
+            // Setup headers directory
+            var installDir = Path.Combine(root, "install");
+            var oggOut = Path.Combine(options.ThirdPartyFolder, "ogg");
+            var vorbisOut = Path.Combine(options.ThirdPartyFolder, "vorbis");

-            try
-            {
-                // Setup headers directory
-                SetupDirectory(dstIncludePath, true);
+            // Deploy header files
+            Utilities.DirectoryCopy(Path.Combine(installDir, "include", "ogg"), oggOut, true, true);
+            Utilities.DirectoryCopy(Path.Combine(installDir, "include", "vorbis"), vorbisOut, true, true);

-                // Deploy header files and restore files
-                Directory.GetFiles(srcIncludePath, "Makefile*").ToList().ForEach(File.Delete);
-                Utilities.DirectoryCopy(srcIncludePath, dstIncludePath, true, true);
-                Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"));
-            }
-            finally
-            {
-                foreach (var filename in filesToKeep)
-                {
-                    var src = Path.Combine(options.IntermediateFolder, filename + ".tmp");
-                    var dst = Path.Combine(dstIncludePath, filename);
-                    Utilities.FileCopy(src, dst);
-                }
-            }
+            Utilities.FileCopy(Path.Combine(root, "libogg", "COPYING"), Path.Combine(oggOut, "COPYING"));
+            Utilities.FileCopy(Path.Combine(root, "libvorbis", "COPYING"), Path.Combine(vorbisOut, "COPYING"));
         }
     }
 }
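The reworked vorbis/ogg build funnels both libraries through one shared CMake install prefix: ogg is configured, built, and installed first, then vorbis is pointed at the installed ogg artifacts, and all binaries and headers are copied from install/lib and install/include, which replaces the old per-build-dir copying and the backup/restore dance around the headers folder. The composed configuration strings end up looking roughly like this (a sketch; the concrete paths are illustrative, not taken from the patch):

// Illustrative values for a Release Linux build.
var installDir = "/Cache/Intermediate/Deps/vorbis/install"; // example path
var oggConfig = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release" +
                $" -DCMAKE_INSTALL_PREFIX=\"{installDir}\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
// vorbis consumes the ogg headers and static library that the preceding
// "cmake --build . --target install" step placed under the shared prefix:
var vorbisConfig = oggConfig +
                   $" -DOGG_INCLUDE_DIR=\"{installDir}/include\"" +
                   $" -DOGG_LIBRARY=\"{installDir}/lib/libogg.a\"";
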
diff --git a/Source/Tools/Flax.Build/Deps/Dependency.cs b/Source/Tools/Flax.Build/Deps/Dependency.cs
index 010a45175..7286bf9f3 100644
--- a/Source/Tools/Flax.Build/Deps/Dependency.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependency.cs
@@ -40,6 +40,11 @@ namespace Flax.Deps
         /// The target platforms to build dependency for (contains only platforms supported by the dependency itself).
         /// </summary>
         public TargetPlatform[] Platforms;
+
+        /// <summary>
+        /// The target architectures to build dependency for (contains only architectures supported by the dependency itself).
+        /// </summary>
+        public TargetArchitecture[] Architectures;
     }

     /// <summary>
@@ -47,7 +52,6 @@ namespace Flax.Deps
     /// </summary>
     protected static TargetPlatform BuildPlatform => Platform.BuildPlatform.Target;
-
     private static Version? _cmakeVersion;
     protected static Version CMakeVersion
     {
         get
         {
             if (_cmakeVersion == null)
             {
-                var versionOutput = Utilities.ReadProcessOutput("cmake", "--version");
-                var versionStart = versionOutput.IndexOf("cmake version ") + "cmake version ".Length;
-                var versionEnd = versionOutput.IndexOfAny(['-', '\n', '\r'], versionStart); // End of line or dash before Git hash
-                var versionString = versionOutput.Substring(versionStart, versionEnd - versionStart);
-                _cmakeVersion = new Version(versionString);
+                try
+                {
+                    var versionOutput = Utilities.ReadProcessOutput("cmake", "--version");
+                    var versionStart = versionOutput.IndexOf("cmake version ") + "cmake version ".Length;
+                    var versionEnd = versionOutput.IndexOfAny(['-', '\n', '\r'], versionStart); // End of line or dash before Git hash
+                    var versionString = versionOutput.Substring(versionStart, versionEnd - versionStart);
+                    _cmakeVersion = new Version(versionString);
+                }
+                catch (Exception)
+                {
+                    // Assume old version by default (in case of errors)
+                    _cmakeVersion = new Version(3, 0);
+                }
             }
             return _cmakeVersion;
         }
@@ -68,7 +80,95 @@ namespace Flax.Deps
     /// <summary>
     /// Gets the platforms list supported by this dependency to build on the current build platform (based on <see cref="BuildPlatform"/>).
     /// </summary>
-    public abstract TargetPlatform[] Platforms { get; }
+    public virtual TargetPlatform[] Platforms
+    {
+        get
+        {
+            // The most common build setup
+            switch (BuildPlatform)
+            {
+            case TargetPlatform.Windows:
+                return new[]
+                {
+                    TargetPlatform.Windows,
+                    TargetPlatform.XboxOne,
+                    TargetPlatform.XboxScarlett,
+                    TargetPlatform.PS4,
+                    TargetPlatform.PS5,
+                    TargetPlatform.Android,
+                    TargetPlatform.Switch,
+                };
+            case TargetPlatform.Linux:
+                return new[]
+                {
+                    TargetPlatform.Linux,
+                };
+            case TargetPlatform.Mac:
+                return new[]
+                {
+                    TargetPlatform.Mac,
+                    TargetPlatform.iOS,
+                };
+            default: return new TargetPlatform[0];
+            }
+        }
+    }
+
+    /// <summary>
+    /// Gets the architectures list supported by this dependency to build on the current build platform (based on <see cref="BuildPlatform"/>).
+    /// </summary>
+    public virtual TargetArchitecture[] Architectures
+    {
+        get
+        {
+            // Default value returns all supported architectures for all supported platforms
+            switch (BuildPlatform)
+            {
+            case TargetPlatform.Windows:
+                return new[]
+                {
+                    TargetArchitecture.x64,
+                    TargetArchitecture.ARM64,
+                };
+            case TargetPlatform.Linux:
+                return new[]
+                {
+                    TargetArchitecture.x64,
+                    //TargetArchitecture.ARM64,
+                };
+            case TargetPlatform.Mac:
+                return new[]
+                {
+                    TargetArchitecture.x64,
+                    TargetArchitecture.ARM64,
+                };
+            case TargetPlatform.XboxOne:
+            case TargetPlatform.XboxScarlett:
+            case TargetPlatform.PS4:
+            case TargetPlatform.PS5:
+                return new[]
+                {
+                    TargetArchitecture.x64,
+                };
+            case TargetPlatform.Switch:
+                return new[]
+                {
+                    TargetArchitecture.ARM64,
+                };
+            case TargetPlatform.Android:
+                return new[]
+                {
+                    TargetArchitecture.ARM64,
+                };
+            case TargetPlatform.iOS:
+                return new[]
+                {
+                    TargetArchitecture.ARM64,
+                };
+            default: return new TargetArchitecture[0];
+            }
+        }
+    }

     /// <summary>
     /// True if build dependency by default, otherwise only when explicitly specified via command line.
     /// </summary>
@@ -85,9 +185,10 @@ namespace Flax.Deps
     /// Logs build process start.
     /// </summary>
     /// <param name="platform">Target platform.</param>
+    /// <param name="architecture">Target architecture.</param>
-    protected void BuildStarted(TargetPlatform platform)
+    protected void BuildStarted(TargetPlatform platform, TargetArchitecture architecture)
     {
-        Log.Info($"Building {GetType().Name} for {platform}");
+        Log.Info($"Building {GetType().Name} for {platform}{(architecture != TargetArchitecture.AnyCPU ? $" ({architecture})" : "")}");
     }

     /// <summary>
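With the try/catch in place, CMakeVersion degrades to a conservative 3.0 instead of throwing when cmake is missing from PATH or prints an unexpected banner, so callers can gate on the version without guarding themselves. A minimal usage sketch (the 3.24 threshold is arbitrary and purely illustrative):

// Safe even when "cmake --version" cannot be read: the property then
// reports 3.0 and this branch is simply skipped instead of crashing.
if (CMakeVersion >= new Version(3, 24))
{
    // rely on newer CMake behavior here
}
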
diff --git a/Source/Tools/Flax.Build/Deps/DepsBuilder.cs b/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
index c43c39ea3..1b8389080 100644
--- a/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
+++ b/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
@@ -38,20 +38,21 @@ namespace Flax.Deps
             var platforms = Globals.AllPlatforms;
             if (Configuration.BuildPlatforms != null && Configuration.BuildPlatforms.Length != 0)
                 platforms = Configuration.BuildPlatforms;
-            platforms = platforms.Where(x => buildPlatform.CanBuildPlatform(x)).ToArray();
-            Log.Verbose("Building deps for platforms:");
+            platforms = platforms.Where(buildPlatform.CanBuildPlatform).ToArray();
+            var architectures = Globals.AllArchitectures;
+            if (Configuration.BuildArchitectures != null && Configuration.BuildArchitectures.Length != 0)
+                architectures = Configuration.BuildArchitectures;
+            architectures = architectures.Where(buildPlatform.CanBuildArchitecture).ToArray();
+            Log.Verbose($"Building deps for platforms {string.Join(',', platforms)}, {string.Join(',', architectures)}:");
             foreach (var platform in platforms)
             {
-                Log.Verbose(" - " + platform);
+                foreach (var architecture in architectures)
+                {
+                    Log.Verbose($" - {platform} ({architecture})");

-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.x64))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.x64);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.x86))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.x86);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.ARM))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.ARM);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.ARM64))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.ARM64);
+                    if (Platform.IsPlatformSupported(platform, architecture))
+                        SetupDepsOutputFolder(options, platform, architecture);
+                }
             }

             // Get all deps
@@ -80,6 +81,14 @@ namespace Flax.Deps
                     continue;
                 }

+                options.Architectures = architectures.Intersect(dependency.Architectures).ToArray();
+                if (options.Architectures.Length == 0)
+                {
+                    Log.Info(string.Format("Skipping {0} ({1}/{2})", name, i + 1, dependencies.Length));
+                    Log.Verbose("Architecture not used on any of the build platforms.");
+                    continue;
+                }
+
                 Log.Info(string.Format("Building {0} ({1}/{2})", name, i + 1, dependencies.Length));
                 options.IntermediateFolder = Path.Combine(Environment.CurrentDirectory, "Cache", "Intermediate", "Deps", name).Replace('\\', '/');
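Condensed, the selection above builds a platform × architecture matrix in three narrowing steps: the host filters what it can build at all, the command line narrows the request, and each dependency's Platforms/Architectures lists trim the final set. A sketch of the equivalent logic; 'buildPlatform' and 'dependency' are as in DepsBuilder above, while the requested* arrays stand in for the command-line configuration and are illustrative names:

// Effective filtering, condensed (sketch, not the literal DepsBuilder code).
var platforms = requestedPlatforms.Where(buildPlatform.CanBuildPlatform)
                                  .Intersect(dependency.Platforms).ToArray();
var architectures = requestedArchitectures.Where(buildPlatform.CanBuildArchitecture)
                                          .Intersect(dependency.Architectures).ToArray();
if (platforms.Length == 0 || architectures.Length == 0)
    return; // nothing buildable for this dependency on this host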