diff --git a/Flax.flaxproj b/Flax.flaxproj
index aa3a8655f..ee5ab4547 100644
--- a/Flax.flaxproj
+++ b/Flax.flaxproj
@@ -4,7 +4,7 @@
"Major": 1,
"Minor": 12,
"Revision": 0,
- "Build": 6901
+ "Build": 6902
},
"Company": "Flax",
"Copyright": "Copyright (c) 2012-2026 Wojciech Figat. All rights reserved.",
diff --git a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
index 954599347..9844f3fda 100644
--- a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
+++ b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
@@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated
settingsButton.Tag = script;
settingsButton.Clicked += OnSettingsButtonClicked;
- group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2);
+ // Adjust the margin so the header text doesn't overlap other UI elements in the header
+ group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin };
group.Object(values, editor);
// Remove drop down arrows and containment lines if no objects in the group
if (group.Children.Count == 0)
diff --git a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
index b3fff5644..28593a7f5 100644
--- a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
+++ b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
@@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors
protected bool NotNullItems;
private IntValueBox _sizeBox;
+ private Label _label;
private Color _background;
private int _elementsCount, _minCount, _maxCount;
private bool _readOnly;
@@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors
Parent = dropPanel,
};
- var label = new Label
+ _label = new Label
{
Text = "Size",
AnchorPreset = AnchorPresets.TopRight,
@@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors
Resize(Count + 1);
};
}
- }
+ Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged;
+ }
+
private void OnSetupContextMenu(ContextMenu menu, DropPanel panel)
{
if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal)))
@@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors
});
}
+ private void OnLayoutSizeChanged(Control control)
+ {
+ if (Layout.ContainerControl is DropPanel dropPanel)
+ {
+ // Hide the "Size" label when the array editor's header title would overlap it
+ var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText);
+ if (headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left)
+ _label.TextColor = _label.TextColorHighlighted = Color.Transparent;
+ else
+ _label.TextColor = _label.TextColorHighlighted = FlaxEngine.GUI.Style.Current.Foreground;
+ }
+ }
+
///
protected override void Deinitialize()
{
_sizeBox = null;
+ Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged;
base.Deinitialize();
}
diff --git a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
index 64bc9080b..055c6a29d 100644
--- a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
+++ b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
@@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements
{
var style = Style.Current;
var settingsButtonSize = Panel.HeaderHeight;
- return new Image
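+ // Reserve header text margin on the right so the title doesn't overlap the settings button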
+ Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin };
+ return new Image
{
TooltipText = "Settings",
AutoFocus = true,
diff --git a/Source/Editor/GUI/Input/ValueBox.cs b/Source/Editor/GUI/Input/ValueBox.cs
index 674ee0697..88ec9a4ee 100644
--- a/Source/Editor/GUI/Input/ValueBox.cs
+++ b/Source/Editor/GUI/Input/ValueBox.cs
@@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input
///
public event Action SlidingEnd;
+ /// <summary>
+ /// If enabled, pressing the Up or Down arrow keys increments/decrements the value.
+ /// </summary>
+ public bool ArrowKeysIncrement = true;
+
///
/// Gets or sets the slider speed. Use value 0 to disable and hide slider UI.
///
@@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input
ResetViewOffset();
}
+ /// <inheritdoc />
+ public override bool OnKeyDown(KeyboardKeys key)
+ {
+ if (ArrowKeysIncrement && (key == KeyboardKeys.ArrowUp || key == KeyboardKeys.ArrowDown))
+ {
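+ // Modifier keys scale the increment: Alt = 0.1, Shift = 10, Ctrl = 100, default = 1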
+ bool altDown = Root.GetKey(KeyboardKeys.Alt);
+ bool shiftDown = Root.GetKey(KeyboardKeys.Shift);
+ bool controlDown = Root.GetKey(KeyboardKeys.Control);
+ float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1f));
+ float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue;
+
+ _startSlideValue = Value;
+ ApplySliding(slideDelta);
+ EndSliding();
+ Focus();
+ return true;
+ }
+
+ return base.OnKeyDown(key);
+ }
+
///
public override bool OnMouseDown(Float2 location, MouseButton button)
{
diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs
index 66e83a6e9..7cd7e7fef 100644
--- a/Source/Editor/Modules/UIModule.cs
+++ b/Source/Editor/Modules/UIModule.cs
@@ -133,6 +133,7 @@ namespace FlaxEditor.Modules
private ContextMenuButton _menuToolsProfilerWindow;
private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault;
private ContextMenuButton _menuToolsTakeScreenshot;
+ private ContextMenuButton _menuToolsOpenLocalFolder;
private ContextMenuChildMenu _menuWindowApplyWindowLayout;
private ToolStripButton _toolStripSaveAll;
@@ -754,6 +755,16 @@ namespace FlaxEditor.Modules
_menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot);
cm.AddSeparator();
cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
+ cm.AddSeparator();
+ var childMenu = cm.AddChildMenu("Open Product Local folder");
+ childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
+ _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () =>
+ {
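+ // The game stores its local data under <LocalApplicationData>/<CompanyName>/<ProductName>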
+ string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+ GameSettings settings = GameSettings.Load();
+ string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+ FileSystem.ShowFileExplorer(path);
+ });
// Window
MenuWindow = MainMenu.AddButton("Window");
@@ -1091,6 +1102,10 @@ namespace FlaxEditor.Modules
_menuToolsBuildNavMesh.Enabled = canEdit;
_menuToolsCancelBuilding.Enabled = GameCooker.IsRunning;
_menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0;
+ string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+ GameSettings settings = GameSettings.Load();
+ string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+ _menuToolsOpenLocalFolder.Enabled = Directory.Exists(path);
c.PerformLayout();
}
diff --git a/Source/Editor/Options/InputOptions.cs b/Source/Editor/Options/InputOptions.cs
index ab473ebed..a759b7247 100644
--- a/Source/Editor/Options/InputOptions.cs
+++ b/Source/Editor/Options/InputOptions.cs
@@ -571,6 +571,10 @@ namespace FlaxEditor.Options
[EditorDisplay("View Flags"), EditorOrder(3260)]
public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift);
+ [DefaultValue(typeof(InputBinding), "None")]
+ [EditorDisplay("View Flags"), EditorOrder(3270)]
+ public InputBinding Particles = new InputBinding(KeyboardKeys.None);
+
#endregion
#region Interface
diff --git a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
index c4fd47f71..4a7150972 100644
--- a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
+++ b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
@@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors
if (value is BoxCollider collider)
collider.AutoResize(!_keepLocalOrientation);
}
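+ // Notify the presenter that the values changed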
+ Presenter.OnModified();
}
}
diff --git a/Source/Editor/Utilities/ShuntingYardParser.cs b/Source/Editor/Utilities/ShuntingYardParser.cs
index 47e2275e5..fe473389c 100644
--- a/Source/Editor/Utilities/ShuntingYardParser.cs
+++ b/Source/Editor/Utilities/ShuntingYardParser.cs
@@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities
/// The result value.
public static double Parse(string text)
{
+ // Hack to allow parsing numbers while using "_" as a separator (like this: 1_000)
+ text = text.Replace("_", string.Empty);
+
var tokens = Tokenize(text);
var rpn = OrderTokens(tokens);
return EvaluateRPN(rpn);
diff --git a/Source/Editor/Viewport/EditorViewport.cs b/Source/Editor/Viewport/EditorViewport.cs
index c16d3d9f5..2af065c68 100644
--- a/Source/Editor/Viewport/EditorViewport.cs
+++ b/Source/Editor/Viewport/EditorViewport.cs
@@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport
InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog);
InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight);
InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals);
+ InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles);
InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess);
InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom);
InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping);
@@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport
new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog),
new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight),
new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals),
+ new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles),
new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess),
new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom),
new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping),
@@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport
if (cm.Visible == false)
return;
var ccm = (ContextMenu)cm;
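+ // Query the view flags once for all menu items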
+ var flags = Task.View.Flags;
foreach (var e in ccm.Items)
{
if (e is ContextMenuButton b && b.Tag != null)
{
var v = (ViewFlags)b.Tag;
- b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
+ b.Icon = (flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
}
}
}
diff --git a/Source/Editor/Windows/EditorOptionsWindow.cs b/Source/Editor/Windows/EditorOptionsWindow.cs
index 0ee9a92d7..c6bf2fd16 100644
--- a/Source/Editor/Windows/EditorOptionsWindow.cs
+++ b/Source/Editor/Windows/EditorOptionsWindow.cs
@@ -45,7 +45,7 @@ namespace FlaxEditor.Windows
{
Parent = this
};
- _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save");
+ _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save.");
_saveButton.Enabled = false;
_tabs = new Tabs
@@ -104,6 +104,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = true;
_isDataDirty = true;
+ if (!Title.EndsWith('*'))
+ Title += "*";
}
}
@@ -113,6 +115,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = false;
_isDataDirty = false;
+ if (Title.EndsWith('*'))
+ Title = Title.Remove(Title.Length - 1);
}
}
diff --git a/Source/Engine/Content/Assets/Material.cpp b/Source/Engine/Content/Assets/Material.cpp
index 019fd9dd8..b4cf55d4d 100644
--- a/Source/Engine/Content/Assets/Material.cpp
+++ b/Source/Engine/Content/Assets/Material.cpp
@@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const
return false;
}
+#if USE_EDITOR
+
+void Material::GetReferences(Array<Guid>& assets, Array<String>& files) const
+{
+ ShaderAssetTypeBase::GetReferences(assets, files);
+
+ // Collect references from material graph (needs to load it)
+ if (!WaitForLoaded() && HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE))
+ {
+ ScopeLock lock(Locker);
+ if (!LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE)))
+ {
+ const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE);
+ if (surfaceChunk)
+ {
+ MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size());
+ MaterialGraph graph;
+ if (!graph.Load(&stream, false))
+ {
+ graph.GetReferences(assets);
+ }
+ }
+ }
+ }
+
+}
+
+#endif
+
const MaterialInfo& Material::GetInfo() const
{
if (_materialShader)
diff --git a/Source/Engine/Content/Assets/Material.h b/Source/Engine/Content/Assets/Material.h
index 4ce47b154..cd2ae8e97 100644
--- a/Source/Engine/Content/Assets/Material.h
+++ b/Source/Engine/Content/Assets/Material.h
@@ -38,6 +38,9 @@ public:
public:
// [MaterialBase]
bool IsMaterialInstance() const override;
+#if USE_EDITOR
+ void GetReferences(Array<Guid>& assets, Array<String>& files) const override;
+#endif
// [IMaterial]
const MaterialInfo& GetInfo() const override;
diff --git a/Source/Engine/Debug/DebugDraw.cpp b/Source/Engine/Debug/DebugDraw.cpp
index 7c798f88f..bea9e76f4 100644
--- a/Source/Engine/Debug/DebugDraw.cpp
+++ b/Source/Engine/Debug/DebugDraw.cpp
@@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool de
return list->Get() + startIndex;
}
+FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest)
+{
+ Array<DebugTriangle>* list;
+ if (depthTest)
+ list = duration > 0 ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles;
+ else
+ list = duration > 0 ? &Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles;
+ const int32 startIndex = list->Count();
+ list->AddUninitialized(count);
+ return list->Get() + startIndex;
+}
+
inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer)
{
Matrix w, fw, m;
@@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Color& color, float duration, bool depthTest)
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Float3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Span<int32>& indices, const Color& color, float duration, bool depthTest)
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
const Float3 origin = Context->Origin;
for (int32 i = 0; i < indices.Length();)
{
@@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Color& color, float duration, bool depthTest)
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Double3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Span<int32>& indices, const Color& color, float duration, bool depthTest)
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
const Double3 origin = Context->Origin;
for (int32 i = 0; i < indices.Length();)
{
diff --git a/Source/Engine/Graphics/Enums.h b/Source/Engine/Graphics/Enums.h
index f6af6c16b..107fe3533 100644
--- a/Source/Engine/Graphics/Enums.h
+++ b/Source/Engine/Graphics/Enums.h
@@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64
///
LightsDebug = 1 << 27,
+ /// <summary>
+ /// Shows/hides particle effects.
+ /// </summary>
+ Particles = 1 << 28,
+
///
/// Default flags for Game.
///
- DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky,
+ DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles,
///
/// Default flags for Editor.
///
- DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky,
+ DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles,
///
/// Default flags for materials/models previews generating.
///
- DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky,
+ DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles,
};
DECLARE_ENUM_OPERATORS(ViewFlags);
diff --git a/Source/Engine/Input/Input.cpp b/Source/Engine/Input/Input.cpp
index 8438977b1..7048140ef 100644
--- a/Source/Engine/Input/Input.cpp
+++ b/Source/Engine/Input/Input.cpp
@@ -80,6 +80,8 @@ Delegate Input::MouseDoubleClick;
Delegate Input::MouseWheel;
Delegate Input::MouseMove;
Action Input::MouseLeave;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonDown;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonUp;
Delegate Input::TouchDown;
Delegate Input::TouchMove;
Delegate Input::TouchUp;
@@ -1027,6 +1029,19 @@ void InputService::Update()
break;
}
}
+ // TODO: route gamepad button events into global InputEvents queue to improve processing
+ for (int32 i = 0; i < Input::Gamepads.Count(); i++)
+ {
+ auto gamepad = Input::Gamepads[i];
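+ // GamepadButton::None is 0, so start polling from button index 1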
+ for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++)
+ {
+ GamepadButton button = (GamepadButton)buttonIdx;
+ if (gamepad->GetButtonDown(button))
+ Input::GamepadButtonDown((InputGamepadIndex)i, button);
+ else if (gamepad->GetButtonUp(button))
+ Input::GamepadButtonUp((InputGamepadIndex)i, button);
+ }
+ }
// Update all actions
for (int32 i = 0; i < Input::ActionMappings.Count(); i++)
diff --git a/Source/Engine/Input/Input.h b/Source/Engine/Input/Input.h
index 8cc1b2106..73e87f5f0 100644
--- a/Source/Engine/Input/Input.h
+++ b/Source/Engine/Input/Input.h
@@ -113,6 +113,16 @@ public:
///
API_EVENT() static Action MouseLeave;
+ /// <summary>
+ /// Event fired when a gamepad button goes down.
+ /// </summary>
+ API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonDown;
+
+ /// <summary>
+ /// Event fired when a gamepad button goes up.
+ /// </summary>
+ API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonUp;
+
///
/// Event fired when touch action begins.
///
diff --git a/Source/Engine/Particles/ParticleEffect.cpp b/Source/Engine/Particles/ParticleEffect.cpp
index 6e94594b0..9592147a7 100644
--- a/Source/Engine/Particles/ParticleEffect.cpp
+++ b/Source/Engine/Particles/ParticleEffect.cpp
@@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const
void ParticleEffect::Draw(RenderContext& renderContext)
{
- if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas)
+ if (renderContext.View.Pass == DrawPass::GlobalSDF ||
+ renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas ||
+ EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles))
return;
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition));
RenderContextBatch renderContextBatch(renderContext);
@@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext)
void ParticleEffect::Draw(RenderContextBatch& renderContextBatch)
{
+ const RenderView& mainView = renderContextBatch.GetMainContext().View;
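+ // Skip rendering when particles are hidden by the view flags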
+ if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles))
+ return;
Particles::DrawParticles(renderContextBatch, this);
// Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections
- const RenderView& mainView = renderContextBatch.GetMainContext().View;
const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius);
if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds))
return;
diff --git a/Source/Engine/Physics/Colliders/BoxCollider.cpp b/Source/Engine/Physics/Colliders/BoxCollider.cpp
index 1e90cb91f..47e551b37 100644
--- a/Source/Engine/Physics/Colliders/BoxCollider.cpp
+++ b/Source/Engine/Physics/Colliders/BoxCollider.cpp
@@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value)
void BoxCollider::AutoResize(bool globalOrientation = true)
{
Actor* parent = GetParent();
- if (Cast(parent))
+ if (parent == nullptr || Cast(parent))
return;
// Get bounds of all siblings (excluding itself)
const Vector3 parentScale = parent->GetScale();
if (parentScale.IsAnyZero())
- return; // Avoid division by zero
+ return;
- // Hacky way to get unrotated bounded box of parent.
+ // Hacky way to get the unrotated bounding box of the parent
const Quaternion parentOrientation = parent->GetOrientation();
parent->SetOrientation(Quaternion::Identity);
BoundingBox parentBox = parent->GetBox();
diff --git a/Source/Engine/Scripting/Scripting.cs b/Source/Engine/Scripting/Scripting.cs
index 7f9f2980c..229e411f3 100644
--- a/Source/Engine/Scripting/Scripting.cs
+++ b/Source/Engine/Scripting/Scripting.cs
@@ -137,8 +137,8 @@ namespace FlaxEngine
{
Debug.LogError($"Unhandled Exception: {exception.Message}");
Debug.LogException(exception);
- if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
- Platform.Fatal($"Unhandled Exception: {exception}");
+ //if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
+ // Platform.Fatal($"Unhandled Exception: {exception}");
}
}
diff --git a/Source/Engine/UI/GUI/Panels/DropPanel.cs b/Source/Engine/UI/GUI/Panels/DropPanel.cs
index de80f9fc5..308272218 100644
--- a/Source/Engine/UI/GUI/Panels/DropPanel.cs
+++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs
@@ -11,6 +11,11 @@ namespace FlaxEngine.GUI
[ActorToolbox("GUI")]
public class DropPanel : ContainerControl
{
+ /// <summary>
+ /// Size of the drop down icon.
+ /// </summary>
+ public const float DropDownIconSize = 14.0f;
+
///
/// The header height.
///
@@ -368,7 +373,7 @@ namespace FlaxEngine.GUI
var style = Style.Current;
var enabled = EnabledInHierarchy;
- // Paint Background
+ // Draw Background
var backgroundColor = BackgroundColor;
if (backgroundColor.A > 0.0f)
{
@@ -386,7 +391,7 @@ namespace FlaxEngine.GUI
float textLeft = 0;
if (EnableDropDownIcon)
{
- textLeft += 14;
+ textLeft += DropDownIconSize;
var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12);
var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey;
if (_isClosed)
@@ -395,7 +400,7 @@ namespace FlaxEngine.GUI
ArrowImageOpened?.Draw(dropDownRect, arrowColor);
}
- // Text
+ // Header text
var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight);
_headerTextMargin.ShrinkRectangle(ref textRect);
var textColor = HeaderTextColor;
@@ -404,7 +409,9 @@ namespace FlaxEngine.GUI
textColor *= 0.6f;
}
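+ // Clip the header text so long titles don't draw over controls placed in the header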
+ Render2D.PushClip(textRect);
Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center);
+ Render2D.PopClip();
if (!_isClosed && EnableContainmentLines)
{
diff --git a/Source/Shaders/GI/DDGI.hlsl b/Source/Shaders/GI/DDGI.hlsl
index 3e31c2e53..b88b846a6 100644
--- a/Source/Shaders/GI/DDGI.hlsl
+++ b/Source/Shaders/GI/DDGI.hlsl
@@ -305,6 +305,8 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, T
uint cascadeIndex = DDGI_DEBUG_CASCADE;
#else
uint cascadeIndex = 0;
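+ // Nothing to sample when no probe cascades are available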
+ if (data.CascadesCount == 0)
+ return float3(0, 0, 0);
for (; cascadeIndex < data.CascadesCount; cascadeIndex++)
{
// Get cascade data
diff --git a/Source/ThirdParty/meshoptimizer/allocator.cpp b/Source/ThirdParty/meshoptimizer/allocator.cpp
index 12eda3872..6b6083da2 100644
--- a/Source/ThirdParty/meshoptimizer/allocator.cpp
+++ b/Source/ThirdParty/meshoptimizer/allocator.cpp
@@ -1,8 +1,17 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
-void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+meshopt_Allocator::Storage& meshopt_Allocator::storage()
{
- meshopt_Allocator::Storage::allocate = allocate;
- meshopt_Allocator::Storage::deallocate = deallocate;
+ static Storage s = {::operator new, ::operator delete };
+ return s;
+}
+#endif
+
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+{
+ meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
+ s.allocate = allocate;
+ s.deallocate = deallocate;
}
diff --git a/Source/ThirdParty/meshoptimizer/clusterizer.cpp b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
index 52fe5a362..73cc0ab53 100644
--- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
@@ -6,19 +6,39 @@
#include <math.h>
#include <string.h>
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
+#define SIMD_SSE
+#include <emmintrin.h>
+#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
+#define SIMD_NEON
+#include <arm_neon.h>
+#endif
+#endif // !MESHOPTIMIZER_NO_SIMD
+
// This work is based on:
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
// Jack Ritter. An Efficient Bounding Sphere. 1990
+// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
+// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
namespace meshopt
{
-// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
-const size_t kMeshletMaxVertices = 255;
+// This must be <= 256 since meshlet indices are stored as bytes
+const size_t kMeshletMaxVertices = 256;
// A reasonable limit is around 2*max_vertices or less
const size_t kMeshletMaxTriangles = 512;
+// We keep a limited number of seed triangles and add a few triangles per finished meshlet
+const size_t kMeshletMaxSeeds = 256;
+const size_t kMeshletAddSeeds = 4;
+
+// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
+const int kMeshletMaxTreeDepth = 50;
+
struct TriangleAdjacency2
{
unsigned int* counts;
@@ -70,72 +90,190 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
for (size_t i = 0; i < vertex_count; ++i)
{
assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
adjacency.offsets[i] -= adjacency.counts[i];
}
}
-static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
- assert(count > 0);
+ size_t face_count = index_count / 3;
- // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
- size_t pmin[3] = {0, 0, 0};
- size_t pmax[3] = {0, 0, 0};
+ // sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+ const unsigned int sparse_seen = 1u << 31;
+ assert(index_count < sparse_seen);
+
+ // allocate arrays
+ adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+ // fill triangle counts
+ for (size_t i = 0; i < index_count; ++i)
+ assert(indices[i] < vertex_count);
+
+ for (size_t i = 0; i < index_count; ++i)
+ adjacency.counts[indices[i]] = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ adjacency.counts[indices[i]]++;
+
+ // fill offset table; uses sparse_seen bit to tag visited vertices
+ unsigned int offset = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int v = indices[i];
+
+ if ((adjacency.counts[v] & sparse_seen) == 0)
+ {
+ adjacency.offsets[v] = offset;
+ offset += adjacency.counts[v];
+ adjacency.counts[v] |= sparse_seen;
+ }
+ }
+
+ assert(offset == index_count);
+
+ // fill triangle data
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+ adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+ }
+
+ // fix offsets that have been disturbed by the previous pass
+ // also fix counts (that were marked with sparse_seen by the first pass)
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int v = indices[i];
+
+ if (adjacency.counts[v] & sparse_seen)
+ {
+ adjacency.counts[v] &= ~sparse_seen;
+
+ assert(adjacency.offsets[v] >= adjacency.counts[v]);
+ adjacency.offsets[v] -= adjacency.counts[v];
+ }
+ }
+}
+
+static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
+{
+ // for sparse inputs, it's faster to only clear vertices referenced by the index buffer
+ if (vertex_count <= index_count)
+ memset(used, -1, vertex_count * sizeof(short));
+ else
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ assert(indices[i] < vertex_count);
+ used[indices[i]] = -1;
+ }
+}
+
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
+{
+ static const float kAxes[7][3] = {
+ // X, Y, Z
+ {1, 0, 0},
+ {0, 1, 0},
+ {0, 0, 1},
+
+ // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
+ {0.57735026f, 0.57735026f, 0.57735026f},
+ {-0.57735026f, 0.57735026f, 0.57735026f},
+ {0.57735026f, -0.57735026f, 0.57735026f},
+ {0.57735026f, 0.57735026f, -0.57735026f},
+ };
+
+ assert(count > 0);
+ assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));
+
+ size_t points_stride_float = points_stride / sizeof(float);
+ size_t radii_stride_float = radii_stride / sizeof(float);
+
+ // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
+ size_t pmin[7], pmax[7];
+ float tmin[7], tmax[7];
+
+ for (size_t axis = 0; axis < axis_count; ++axis)
+ {
+ pmin[axis] = pmax[axis] = 0;
+ tmin[axis] = FLT_MAX;
+ tmax[axis] = -FLT_MAX;
+ }
for (size_t i = 0; i < count; ++i)
{
- const float* p = points[i];
+ const float* p = points + i * points_stride_float;
+ float r = radii[i * radii_stride_float];
- for (int axis = 0; axis < 3; ++axis)
+ for (size_t axis = 0; axis < axis_count; ++axis)
{
- pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
- pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+ const float* ax = kAxes[axis];
+
+ float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
+ float tpmin = tp - r, tpmax = tp + r;
+
+ pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
+ pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
+ tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
+ tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
}
}
// find the pair of points with largest distance
- float paxisd2 = 0;
- int paxis = 0;
+ size_t paxis = 0;
+ float paxisdr = 0;
- for (int axis = 0; axis < 3; ++axis)
+ for (size_t axis = 0; axis < axis_count; ++axis)
{
- const float* p1 = points[pmin[axis]];
- const float* p2 = points[pmax[axis]];
+ const float* p1 = points + pmin[axis] * points_stride_float;
+ const float* p2 = points + pmax[axis] * points_stride_float;
+ float r1 = radii[pmin[axis] * radii_stride_float];
+ float r2 = radii[pmax[axis] * radii_stride_float];
float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+ float dr = sqrtf(d2) + r1 + r2;
- if (d2 > paxisd2)
+ if (dr > paxisdr)
{
- paxisd2 = d2;
+ paxisdr = dr;
paxis = axis;
}
}
// use the longest segment as the initial sphere diameter
- const float* p1 = points[pmin[paxis]];
- const float* p2 = points[pmax[paxis]];
+ const float* p1 = points + pmin[paxis] * points_stride_float;
+ const float* p2 = points + pmax[paxis] * points_stride_float;
+ float r1 = radii[pmin[paxis] * radii_stride_float];
+ float r2 = radii[pmax[paxis] * radii_stride_float];
- float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
- float radius = sqrtf(paxisd2) / 2;
+ float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
+ float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
+
+ float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
+ float radius = paxisdr / 2;
// iteratively adjust the sphere up until all points fit
for (size_t i = 0; i < count; ++i)
{
- const float* p = points[i];
+ const float* p = points + i * points_stride_float;
+ float r = radii[i * radii_stride_float];
+
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+ float d = sqrtf(d2);
- if (d2 > radius * radius)
+ if (d + r > radius)
{
- float d = sqrtf(d2);
- assert(d > 0);
+ float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;
- float k = 0.5f + (radius / d) / 2;
-
- center[0] = center[0] * k + p[0] * (1 - k);
- center[1] = center[1] * k + p[1] * (1 - k);
- center[2] = center[2] * k + p[2] * (1 - k);
- radius = (radius + d) / 2;
+ center[0] += k * (p[0] - center[0]);
+ center[1] += k * (p[1] - center[1]);
+ center[2] += k * (p[2] - center[2]);
+ radius = (radius + d + r) / 2;
}
}
@@ -151,12 +289,12 @@ struct Cone
float nx, ny, nz;
};
-static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
{
float cone = 1.f - spread * cone_weight;
float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
- return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+ return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
}
static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
@@ -221,72 +359,61 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices,
return mesh_area;
}
-static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
{
- size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
-
- // fill 4b padding with 0
- while (offset & 3)
- meshlet_triangles[offset++] = 0;
-}
-
-static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
-{
- unsigned char& av = used[a];
- unsigned char& bv = used[b];
- unsigned char& cv = used[c];
+ short& av = used[a];
+ short& bv = used[b];
+ short& cv = used[c];
bool result = false;
- unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+ int used_extra = (av < 0) + (bv < 0) + (cv < 0);
- if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+ if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
{
meshlets[meshlet_offset] = meshlet;
for (size_t j = 0; j < meshlet.vertex_count; ++j)
- used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
-
- finishMeshlet(meshlet, meshlet_triangles);
+ used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
meshlet.vertex_offset += meshlet.vertex_count;
- meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+ meshlet.triangle_offset += meshlet.triangle_count * 3;
meshlet.vertex_count = 0;
meshlet.triangle_count = 0;
result = true;
}
- if (av == 0xff)
+ if (av < 0)
{
- av = (unsigned char)meshlet.vertex_count;
+ av = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
}
- if (bv == 0xff)
+ if (bv < 0)
{
- bv = (unsigned char)meshlet.vertex_count;
+ bv = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
}
- if (cv == 0xff)
+ if (cv < 0)
{
- cv = (unsigned char)meshlet.vertex_count;
+ cv = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
}
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
meshlet.triangle_count++;
return result;
}
-static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra)
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
{
unsigned int best_triangle = ~0u;
- unsigned int best_extra = 5;
+ int best_priority = 5;
float best_score = FLT_MAX;
for (size_t i = 0; i < meshlet.vertex_count; ++i)
@@ -301,61 +428,159 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
unsigned int triangle = neighbors[j];
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
- unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+ int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+ assert(extra <= 2);
+
+ int priority = -1;
// triangles that don't add new vertices to meshlets are max. priority
- if (extra != 0)
- {
- // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
- if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
- extra = 0;
-
- extra++;
- }
+ if (extra == 0)
+ priority = 0;
+ // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+ else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+ priority = 1;
+ // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
+ else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
+ priority = 1 + extra;
+ // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
+ else
+ priority = 2 + extra;
// since topology-based priority is always more important than the score, we can skip scoring in some cases
- if (extra > best_extra)
+ if (priority > best_priority)
continue;
- float score = 0;
+ const Cone& tri_cone = triangles[triangle];
- // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
- if (meshlet_cone)
- {
- const Cone& tri_cone = triangles[triangle];
+ float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
+ float distance = sqrtf(dx * dx + dy * dy + dz * dz);
+ float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
- float distance2 =
- (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
- (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
- (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
-
- float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
-
- score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
- }
- else
- {
- // each live_triangles entry is >= 1 since it includes the current triangle we're processing
- score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
- }
+ float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
// note that topology-based priority is always more important than the score
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
- if (extra < best_extra || score < best_score)
+ if (priority < best_priority || score < best_score)
{
best_triangle = triangle;
- best_extra = extra;
+ best_priority = priority;
best_score = score;
}
}
}
- if (out_extra)
- *out_extra = best_extra;
-
return best_triangle;
}
+static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+ unsigned int best_seeds[kMeshletAddSeeds];
+ unsigned int best_live[kMeshletAddSeeds];
+ float best_score[kMeshletAddSeeds];
+
+ for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+ {
+ best_seeds[i] = ~0u;
+ best_live[i] = ~0u;
+ best_score[i] = FLT_MAX;
+ }
+
+ for (size_t i = 0; i < meshlet.vertex_count; ++i)
+ {
+ unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+ unsigned int best_neighbor = ~0u;
+ unsigned int best_neighbor_live = ~0u;
+
+ // find the neighbor with the smallest live metric
+ unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+ size_t neighbors_size = adjacency.counts[index];
+
+ for (size_t j = 0; j < neighbors_size; ++j)
+ {
+ unsigned int triangle = neighbors[j];
+ unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+ unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+
+ if (live < best_neighbor_live)
+ {
+ best_neighbor = triangle;
+ best_neighbor_live = live;
+ }
+ }
+
+ // add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
+ if (best_neighbor == ~0u)
+ continue;
+
+ float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
+ float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ for (size_t j = 0; j < kMeshletAddSeeds; ++j)
+ {
+ // non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
+ if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
+ {
+ best_seeds[j] = best_neighbor;
+ best_live[j] = best_neighbor_live;
+ best_score[j] = best_neighbor_score;
+ break;
+ }
+ }
+ }
+
+ // add surviving seeds to the meshlet
+ size_t seed_count = 0;
+
+ for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+ if (best_seeds[i] != ~0u)
+ seeds[seed_count++] = best_seeds[i];
+
+ return seed_count;
+}
+
+static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
+{
+ size_t result = 0;
+
+ for (size_t i = 0; i < seed_count; ++i)
+ {
+ unsigned int index = seeds[i];
+
+ seeds[result] = index;
+ result += emitted_flags[index] == 0;
+ }
+
+ return result;
+}
+
+static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+ unsigned int best_seed = ~0u;
+ unsigned int best_live = ~0u;
+ float best_score = FLT_MAX;
+
+ for (size_t i = 0; i < seed_count; ++i)
+ {
+ unsigned int index = seeds[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+ float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
+ float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (live < best_live || (live == best_live && score < best_score))
+ {
+ best_seed = index;
+ best_live = live;
+ best_score = score;
+ }
+ }
+
+ return best_seed;
+}
+
struct KDNode
{
union
@@ -364,13 +589,13 @@ struct KDNode
unsigned int index;
};
- // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+ // leaves: axis = 3, children = number of points including this one
// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
unsigned int axis : 2;
unsigned int children : 30;
};
-static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
{
size_t m = 0;
@@ -400,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
result.index = indices[0];
result.axis = 3;
- result.children = unsigned(count - 1);
+ result.children = unsigned(count);
// all remaining points are stored in nodes immediately following the leaf
for (size_t i = 1; i < count; ++i)
@@ -415,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
return offset + count;
}
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
{
assert(count > 0);
assert(offset < node_count);
@@ -441,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
}
// split axis is one where the variance is largest
- unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+ int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
float split = mean[axis];
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
// when the partition is degenerate simply consolidate the points into a single node
- if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
KDNode& result = nodes[offset];
@@ -456,35 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
result.axis = axis;
// left subtree is right after our node
- size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+ size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
// distance to the right subtree is represented explicitly
+ assert(next_offset - offset > 1);
result.children = unsigned(next_offset - offset - 1);
- return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+ return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
}
static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
{
const KDNode& node = nodes[root];
+ if (node.children == 0)
+ return;
+
if (node.axis == 3)
{
// leaf
- for (unsigned int i = 0; i <= node.children; ++i)
+ bool inactive = true;
+
+ for (unsigned int i = 0; i < node.children; ++i)
{
unsigned int index = nodes[root + i].index;
if (emitted_flags[index])
continue;
+ inactive = false;
+
const float* point = points + index * stride;
- float distance2 =
- (point[0] - position[0]) * (point[0] - position[0]) +
- (point[1] - position[1]) * (point[1] - position[1]) +
- (point[2] - position[2]) * (point[2] - position[2]);
- float distance = sqrtf(distance2);
+ float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
+ float distance = sqrtf(dx * dx + dy * dy + dz * dz);
if (distance < limit)
{
@@ -492,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
limit = distance;
}
}
+
+ // deactivate leaves that no longer have items to emit
+ if (inactive)
+ nodes[root].children = 0;
}
else
{
@@ -500,6 +735,12 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
unsigned int first = (delta <= 0) ? 0 : node.children;
unsigned int second = first ^ node.children;
+ // deactivate branches that no longer have items to emit to accelerate traversal
+ // note that we do this *before* recursing which delays deactivation but keeps tail calls
+ if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
+ nodes[root].children = 0;
+
+ // recursion depth is bounded by tree depth (which is limited by construction)
kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
// only process the other node if it can have a match based on closest distance so far
@@ -508,6 +749,380 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
}
}
+struct BVHBoxT
+{
+ float min[4];
+ float max[4];
+};
+
+struct BVHBox
+{
+ float min[3];
+ float max[3];
+};
+
+#if defined(SIMD_SSE)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ __m128 min = _mm_loadu_ps(box.min);
+ __m128 max = _mm_loadu_ps(box.max);
+
+ // note: over-read is safe because BVHBox array is allocated with padding
+ min = _mm_min_ps(min, _mm_loadu_ps(other.min));
+ max = _mm_max_ps(max, _mm_loadu_ps(other.max));
+
+ _mm_storeu_ps(box.min, min);
+ _mm_storeu_ps(box.max, max);
+
+ __m128 size = _mm_sub_ps(max, min);
+ __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
+ __m128 mul = _mm_mul_ps(size, size_yzx);
+ __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
+ __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
+
+ return _mm_cvtss_f32(sum_xyz);
+}
+#elif defined(SIMD_NEON)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ float32x4_t min = vld1q_f32(box.min);
+ float32x4_t max = vld1q_f32(box.max);
+
+ // note: over-read is safe because BVHBox array is allocated with padding
+ min = vminq_f32(min, vld1q_f32(other.min));
+ max = vmaxq_f32(max, vld1q_f32(other.max));
+
+ vst1q_f32(box.min, min);
+ vst1q_f32(box.max, max);
+
+ float32x4_t size = vsubq_f32(max, min);
+ float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
+ float32x4_t mul = vmulq_f32(size, size_yzx);
+ float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
+ float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
+
+ return sum_xyz;
+}
+#else
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ for (int k = 0; k < 3; ++k)
+ {
+ box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
+ box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
+ }
+
+ float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
+ return sx * sy + sx * sz + sy * sz;
+}
+#endif
+
+inline unsigned int radixFloat(unsigned int v)
+{
+ // if sign bit is 0, flip sign bit
+ // if sign bit is 1, flip everything
+ unsigned int mask = (int(v) >> 31) | 0x80000000;
+ return v ^ mask;
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
+{
+ memset(hist, 0, sizeof(hist));
+
+ const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
+
+ // compute 3 10-bit histograms in parallel (dropping 2 LSB)
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = radixFloat(bits[i]);
+
+ hist[(id >> 2) & 1023][0]++;
+ hist[(id >> 12) & 1023][1]++;
+ hist[(id >> 22) & 1023][2]++;
+ }
+
+ unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
+
+ // replace histogram data with prefix histogram sums in-place
+ for (int i = 0; i < 1024; ++i)
+ {
+ unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+ hist[i][0] = sum0;
+ hist[i][1] = sum1;
+ hist[i][2] = sum2;
+
+ sum0 += hx;
+ sum1 += hy;
+ sum2 += hz;
+ }
+
+ assert(sum0 == count && sum1 == count && sum2 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+ const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
+ int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
+
+ destination[hist[id][pass]++] = source[i];
+ }
+}
+
+static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
+{
+ (void)vertex_count;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ const float* va = vertex_positions + vertex_stride_float * a;
+ const float* vb = vertex_positions + vertex_stride_float * b;
+ const float* vc = vertex_positions + vertex_stride_float * c;
+
+ BVHBox& box = boxes[i];
+
+ for (int k = 0; k < 3; ++k)
+ {
+ box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
+ box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
+
+ box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
+ box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
+
+ centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
+ }
+ }
+}
+
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
+{
+ // count number of unique vertices
+ size_t used_vertices = 0;
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int index = order[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+ used[a] = used[b] = used[c] = 1;
+
+ if (out)
+ out[i] = unsigned(used_vertices);
+ }
+
+ // reset used[] for future invocations
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int index = order[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ used[a] = used[b] = used[c] = -1;
+ }
+
+ return used_vertices;
+}
+
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
+ // mark meshlet boundary for future reassembly
+ assert(count > 0);
+
+ boundary[0] = 1;
+ memset(boundary + 1, 0, count - 1);
+}
+
+static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
+{
+ for (size_t i = 0; i < count;)
+ {
+ size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
+
+ if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
+ {
+ bvhPackLeaf(boundary + i, chunk);
+ i += chunk;
+ continue;
+ }
+
+ // chunk is vertex bound, split it into smaller meshlets
+ assert(chunk > max_vertices / 3);
+
+ bvhPackLeaf(boundary + i, max_vertices / 3);
+ i += max_vertices / 3;
+ }
+}
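+
+// note: a leaf of max_vertices/3 triangles always fits the vertex limit, since it references at most max_vertices vertices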
+
+static bool bvhDivisible(size_t count, size_t min, size_t max)
+{
+// count is representable as a sum of k values in [min..max] if it is in the range [k*min..k*min+k*(max-min)]
+ // equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
+ // we avoid expensive integer divisions in the common case where min is <= max/2
+ return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
+}
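+
+// example of the above (min=3, max=4, modulo branch): count=5 fails since 5%3=2 > (5/3)*(4-3)=1
+// (5 is indeed not a sum of values in [3..4]), while count=7 passes since 7%3=1 <= 2 (7=3+4)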
+
+static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
+{
+ BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
+ BVHBoxT accumr = accuml;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ float larea = boxMerge(accuml, boxes[order[i]]);
+ float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
+
+ areas[i] = larea;
+ areas[i + count] = rarea;
+ }
+}
+
+static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
+{
+ bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
+ size_t end = aligned ? count - min : count - 1;
+
+ float rmaxfill = 1.f / float(int(maxfill));
+
+ // find best split that minimizes SAH
+ size_t bestsplit = 0;
+ float bestcost = FLT_MAX;
+
+ for (size_t i = min - 1; i < end; i += step)
+ {
+ size_t lsplit = i + 1, rsplit = count - (i + 1);
+
+ if (!bvhDivisible(lsplit, min, max))
+ continue;
+ if (aligned && !bvhDivisible(rsplit, min, max))
+ continue;
+
+ // areas[x] = inclusive surface area of boxes[0..x]
+ // areas[count-1-x] = inclusive surface area of boxes[x..count-1]
+ float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
+ float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
+
+ if (cost > bestcost)
+ continue;
+
+ // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
+ // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
+ size_t lfill = vertices ? vertices[i] : lsplit;
+ size_t rfill = vertices ? vertices[i] : rsplit;
+
+ // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
+ int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
+ int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
+
+ cost += fill * (float(lrest) * larea + float(rrest) * rarea);
+
+ if (cost < bestcost)
+ {
+ bestcost = cost;
+ bestsplit = i + 1;
+ }
+ }
+
+ *out_cost = bestcost;
+ return bestsplit;
+}
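+
+// fill term example: with maxfill=64 and a split that leaves lfill=40 used slots,
+// lrest = ceil(40/64)*64 - 40 = 24 wasted slots, which penalizes splits that leave clusters underfilled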
+
+static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+ size_t l = 0, r = split;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned char side = sides[order[i]];
+ target[side ? r : l] = order[i];
+ l += 1;
+ l -= side;
+ r += side;
+ }
+
+ assert(l == split && r == count);
+}
+
+static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
+ return bvhPackLeaf(boundary, count);
+
+ unsigned int* axes[3] = {orderx, ordery, orderz};
+
+ // we can use step=1 unconditionally, but to reduce the cost in the min=max case we use step=max
+ size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
+
+ // if we could not pack the meshlet, we must be vertex bound
+ size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
+ size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
+
+ // find best split that minimizes SAH
+ int bestk = -1;
+ size_t bestsplit = 0;
+ float bestcost = FLT_MAX;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ float* areas = static_cast<float*>(scratch);
+ unsigned int* vertices = NULL;
+
+ bvhComputeArea(areas, boxes, axes[k], count);
+
+ if (count <= max_triangles)
+ {
+ // for vertex bound clusters, count number of unique vertices for each split
+ vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
+ bvhCountVertices(axes[k], count, used, indices, vertices);
+ }
+
+ float axiscost = FLT_MAX;
+ size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
+
+ if (axissplit && axiscost < bestcost)
+ {
+ bestk = k;
+ bestcost = axiscost;
+ bestsplit = axissplit;
+ }
+ }
+
+ // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+ if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
+ return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
+
+ // mark sides of split for partitioning
+ unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+ for (size_t i = 0; i < bestsplit; ++i)
+ sides[axes[bestk][i]] = 0;
+
+ for (size_t i = bestsplit; i < count; ++i)
+ sides[axes[bestk][i]] = 1;
+
+ // partition all axes into two sides, maintaining order
+ unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ if (k == bestk)
+ continue;
+
+ unsigned int* axis = axes[k];
+ memcpy(temp, axis, sizeof(unsigned int) * count);
+ bvhPartition(axis, temp, sides, bestsplit, count);
+ }
+
+ // recursion depth is bounded due to max depth check above
+ bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+ bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
} // namespace meshopt
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
@@ -517,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
assert(index_count % 3 == 0);
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
(void)kMeshletMaxVertices;
(void)kMeshletMaxTriangles;
@@ -532,7 +1146,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
}
-size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
{
using namespace meshopt;
@@ -541,18 +1155,24 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
assert(vertex_positions_stride % sizeof(float) == 0);
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
- assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+ assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
assert(cone_weight >= 0 && cone_weight <= 1);
+ assert(split_factor >= 0);
+
+ if (index_count == 0)
+ return 0;
meshopt_Allocator allocator;
TriangleAdjacency2 adjacency = {};
- buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+ if (vertex_count > index_count && index_count < (1u << 31))
+ buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+ else
+ buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
- unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
- memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+ // live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+ unsigned int* live_triangles = adjacency.counts;
size_t face_count = index_count / 3;
@@ -573,11 +1193,45 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
kdindices[i] = unsigned(i);
KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
- kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+ kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
- // index of the vertex in the meshlet, 0xff if the vertex isn't used
- unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
- memset(used, -1, vertex_count);
+ // find a specific corner of the mesh to use as a starting point for meshlet flow
+ float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ const Cone& tri = triangles[i];
+
+ cornerx = cornerx > tri.px ? tri.px : cornerx;
+ cornery = cornery > tri.py ? tri.py : cornery;
+ cornerz = cornerz > tri.pz ? tri.pz : cornerz;
+ }
+
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
+
+ // initial seed triangle is the one closest to the corner
+ unsigned int initial_seed = ~0u;
+ float initial_score = FLT_MAX;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ const Cone& tri = triangles[i];
+
+ float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
+ float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (initial_seed == ~0u || score < initial_score)
+ {
+ initial_seed = unsigned(i);
+ initial_score = score;
+ }
+ }
+
+ // seed triangles to continue meshlet flow
+ unsigned int seeds[kMeshletMaxSeeds] = {};
+ size_t seed_count = 0;
meshopt_Meshlet meshlet = {};
size_t meshlet_offset = 0;
@@ -588,46 +1242,61 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
{
Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
- unsigned int best_extra = 0;
- unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
+ unsigned int best_triangle = ~0u;
- // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
- if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
- {
- best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
- }
+ // for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
+ // to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
+ if (meshlet_offset == 0 && meshlet.triangle_count == 0)
+ best_triangle = initial_seed;
+ else
+ best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
- // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+ bool split = false;
+
+ // when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
if (best_triangle == ~0u)
{
float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
unsigned int index = ~0u;
- float limit = FLT_MAX;
+ float distance = FLT_MAX;
- kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+ kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
best_triangle = index;
+ split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
}
if (best_triangle == ~0u)
break;
+ int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
+
+ // if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
+ if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+ {
+ seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
+ seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
+ seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
+
+ unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
+
+ // we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
+ best_triangle = best_seed != ~0u ? best_seed : best_triangle;
+ }
+
unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
- if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+ if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
{
meshlet_offset++;
memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
}
- live_triangles[a]--;
- live_triangles[b]--;
- live_triangles[c]--;
-
// remove emitted triangle from adjacency data
// this makes sure that we spend less time traversing these lists on subsequent iterations
+ // live triangle counts are updated as a byproduct of these adjustments
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[best_triangle * 3 + k];
@@ -656,20 +1325,23 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
meshlet_cone_acc.ny += triangles[best_triangle].ny;
meshlet_cone_acc.nz += triangles[best_triangle].nz;
+ assert(!emitted_flags[best_triangle]);
emitted_flags[best_triangle] = 1;
}
if (meshlet.triangle_count)
- {
- finishMeshlet(meshlet, meshlet_triangles);
-
meshlets[meshlet_offset++] = meshlet;
- }
- assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+ assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
return meshlet_offset;
}
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+ return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
+}
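+
+// caller-side sizing sketch for the flex variant (illustrative, mirrors the asserts above):
+//   size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
+//   meshlets needs max_meshlets entries, meshlet_vertices max_meshlets * max_vertices entries,
+//   and meshlet_triangles max_meshlets * max_triangles * 3 entries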
+
size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
using namespace meshopt;
@@ -678,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
meshopt_Allocator allocator;
- // index of the vertex in the meshlet, 0xff if the vertex isn't used
- unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
- memset(used, -1, vertex_count);
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
meshopt_Meshlet meshlet = {};
size_t meshlet_offset = 0;
@@ -699,13 +1370,109 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
}
if (meshlet.triangle_count)
- {
- finishMeshlet(meshlet, meshlet_triangles);
-
meshlets[meshlet_offset++] = meshlet;
- }
assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
+ return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+ assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+
+ if (index_count == 0)
+ return 0;
+
+ size_t face_count = index_count / 3;
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ meshopt_Allocator allocator;
+
+ // 3 floats plus 1 uint for sorting, or
+ // 2 floats plus 1 uint for pivoting, or
+ // 1 uint plus 1 byte for partitioning
+ float* scratch = allocator.allocate<float>(face_count * 4);
+
+ // compute bounding boxes and centroids for sorting
+ BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
+ bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+ memset(boxes + face_count, 0, sizeof(BVHBox));
+
+ unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
+ unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ unsigned int* order = axes + k * face_count;
+ const float* keys = scratch + k * face_count;
+
+ unsigned int hist[1024][3];
+ computeHistogram(hist, keys, face_count);
+
+ // 3-pass radix sort computes the resulting order into axes
+ for (size_t i = 0; i < face_count; ++i)
+ temp[i] = unsigned(i);
+
+ radixPass(order, temp, keys, face_count, hist, 0);
+ radixPass(temp, order, keys, face_count, hist, 1);
+ radixPass(order, temp, keys, face_count, hist, 2);
+ }
+
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
+
+ unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
+
+ bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+
+ // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
+ size_t meshlet_count = 0;
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ assert(boundary[i] <= 1);
+ meshlet_count += boundary[i];
+ }
+
+ size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
+
+ // pack triangles into meshlets according to the order and boundaries marked by bvhSplit
+ meshopt_Meshlet meshlet = {};
+ size_t meshlet_offset = 0;
+ size_t meshlet_pending = meshlet_count;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ assert(boundary[i] <= 1);
+ bool split = i > 0 && boundary[i] == 1;
+
+ // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
+ if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
+ split = false;
+
+ unsigned int index = axes[i];
+ assert(index < face_count);
+
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ // appends triangle to the meshlet and writes previous meshlet to the output if full
+ meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
+ meshlet_pending -= boundary[i];
+ }
+
+ if (meshlet.triangle_count)
+ meshlets[meshlet_offset++] = meshlet;
+
+ assert(meshlet_offset <= meshlet_bound);
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
return meshlet_offset;
}
@@ -765,15 +1532,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
if (triangles == 0)
return bounds;
+ const float rzero = 0.f;
+
// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
float psphere[4] = {};
- computeBoundingSphere(psphere, corners[0], triangles * 3);
+ computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);
float center[3] = {psphere[0], psphere[1], psphere[2]};
// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
float nsphere[4] = {};
- computeBoundingSphere(nsphere, normals, triangles);
+ computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);
float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@@ -883,6 +1652,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}
+meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
+{
+ using namespace meshopt;
+
+ assert(positions_stride >= 12 && positions_stride <= 256);
+ assert(positions_stride % sizeof(float) == 0);
+ assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
+ assert(radii_stride % sizeof(float) == 0);
+
+ meshopt_Bounds bounds = {};
+
+ if (count == 0)
+ return bounds;
+
+ const float rzero = 0.f;
+
+ float psphere[4] = {};
+ computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);
+
+ bounds.center[0] = psphere[0];
+ bounds.center[1] = psphere[1];
+ bounds.center[2] = psphere[2];
+ bounds.radius = psphere[3];
+
+ return bounds;
+}
+
void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
{
using namespace meshopt;
@@ -950,25 +1746,28 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
unsigned int order[kMeshletMaxVertices];
- unsigned char remap[kMeshletMaxVertices];
- memset(remap, -1, vertex_count);
+ short remap[kMeshletMaxVertices];
+ memset(remap, -1, vertex_count * sizeof(short));
size_t vertex_offset = 0;
for (size_t i = 0; i < triangle_count * 3; ++i)
{
- unsigned char& r = remap[indices[i]];
+ short& r = remap[indices[i]];
- if (r == 0xff)
+ if (r < 0)
{
- r = (unsigned char)(vertex_offset);
+ r = short(vertex_offset);
order[vertex_offset] = vertices[indices[i]];
vertex_offset++;
}
- indices[i] = r;
+ indices[i] = (unsigned char)r;
}
assert(vertex_offset <= vertex_count);
memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
diff --git a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
similarity index 58%
rename from Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
index 368274382..87ceeae66 100644
--- a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
@@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind
return result;
}
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ meshopt_VertexFetchStatistics result = {};
+
+ unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
+ memset(vertex_visited, 0, vertex_count);
+
+ const size_t kCacheLine = 64;
+ const size_t kCacheSize = 128 * 1024;
+
+ // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+ size_t cache[kCacheSize / kCacheLine] = {};
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ vertex_visited[index] = 1;
+
+ size_t start_address = index * vertex_size;
+ size_t end_address = start_address + vertex_size;
+
+ size_t start_tag = start_address / kCacheLine;
+ size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
+
+ assert(start_tag < end_tag);
+
+ for (size_t tag = start_tag; tag < end_tag; ++tag)
+ {
+ size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+ // we store +1 since cache is filled with 0 by default
+ result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+ cache[line] = tag + 1;
+ }
+ }
+
+ size_t unique_vertex_count = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ unique_vertex_count += vertex_visited[i];
+
+ result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
+
+ return result;
+}
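+
+// illustrative numbers for the model above: with vertex_size=20, the vertex at index 3 occupies bytes 60..79,
+// i.e. cache tags 0 and 1, so a cold fetch accounts for 2 * kCacheLine = 128 bytes;
+// overfetch is bytes_fetched relative to the ideal unique_vertex_count * vertex_size, so 1.0 means no wasted bandwidth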
diff --git a/Source/ThirdParty/meshoptimizer/indexcodec.cpp b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
index b30046005..7a8fd6867 100644
--- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
const unsigned char kSequenceHeader = 0xd0;
static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;
typedef unsigned int VertexFifo[16];
typedef unsigned int EdgeFifo[16][2];
@@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
if (fer >= 0 && (fer >> 2) < 15)
{
+ // note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
const unsigned int* order = kTriangleIndexOrder[fer & 3];
unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
@@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+ // note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
int fea = (a == next) ? (next++, 0) : 15;
int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
@@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
void meshopt_encodeIndexVersion(int version)
{
- assert(unsigned(version) <= 1);
+ assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));
meshopt::gEncodeIndexVersion = version;
}
+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+ if (buffer_size < 1)
+ return -1;
+
+ unsigned char header = buffer[0];
+
+ if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
+ return -1;
+
+ int version = header & 0x0f;
+ if (version > meshopt::kDecodeIndexVersion)
+ return -1;
+
+ return version;
+}
+
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
@@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
return -1;
int version = buffer[0] & 0x0f;
- if (version > 1)
+ if (version > kDecodeIndexVersion)
return -1;
EdgeFifo edgefifo;
@@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
// fifo reads are wrapped around 16 entry buffer
unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+ unsigned int c = 0;
int fec = codetri & 15;
@@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
{
// fifo reads are wrapped around 16 entry buffer
unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
- unsigned int c = (fec == 0) ? next : cf;
+ c = (fec == 0) ? next : cf;
int fec0 = fec == 0;
next += fec0;
- // output triangle
- writeTriangle(destination, i, index_size, a, b, c);
-
- // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
-
- pushEdgeFifo(edgefifo, c, b, edgefifooffset);
- pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
else
{
- unsigned int c = 0;
-
// fec - (fec ^ 3) decodes 13, 14 into -1, 1
// note that we need to update the last index since free indices are delta-encoded
last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
- // output triangle
- writeTriangle(destination, i, index_size, a, b, c);
-
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset);
-
- pushEdgeFifo(edgefifo, c, b, edgefifooffset);
- pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
+
+ // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
}
else
{
@@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
return -1;
int version = buffer[0] & 0x0f;
- if (version > 1)
+ if (version > kDecodeIndexVersion)
return -1;
const unsigned char* data = buffer + 1;
diff --git a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
index f6728345a..4bf9fccad 100644
--- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
@@ -5,7 +5,9 @@
#include <string.h>
// This work is based on:
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
+// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
namespace meshopt
{
@@ -85,6 +87,46 @@ struct VertexStreamHasher
}
};
+struct VertexCustomHasher
+{
+ const float* vertex_positions;
+ size_t vertex_stride_float;
+
+ int (*callback)(void*, unsigned int, unsigned int);
+ void* context;
+
+ size_t hash(unsigned int index) const
+ {
+ const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);
+
+ unsigned int x = key[0], y = key[1], z = key[2];
+
+ // replace negative zero with zero
+ x = (x == 0x80000000) ? 0 : x;
+ y = (y == 0x80000000) ? 0 : y;
+ z = (z == 0x80000000) ? 0 : z;
+
+ // scramble bits to make sure that integer coordinates have entropy in lower bits
+ x ^= x >> 17;
+ y ^= y >> 17;
+ z ^= z >> 17;
+
+ // Optimized Spatial Hashing for Collision Detection of Deformable Objects
+ return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ const float* lp = vertex_positions + lhs * vertex_stride_float;
+ const float* rp = vertex_positions + rhs * vertex_stride_float;
+
+ if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2])
+ return false;
+
+ return callback ? callback(context, lhs, rhs) : true;
+ }
+};
+
struct EdgeHasher
{
const unsigned int* remap;
@@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
allocator.deallocate(vertex_table);
}
+template <typename Hash>
+static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices ? indices[i] : unsigned(i);
+ assert(index < vertex_count);
+
+ if (remap[index] != ~0u)
+ continue;
+
+ unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+ if (*entry == ~0u)
+ {
+ *entry = index;
+ remap[index] = next_vertex++;
+ }
+ else
+ {
+ assert(remap[*entry] != ~0u);
+ remap[index] = remap[*entry];
+ }
+ }
+
+ assert(next_vertex <= vertex_count);
+ return next_vertex;
+}
+
template
static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
@@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex
}
}
+template <typename Hash>
+static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ if (remap[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+ if (*entry == ~0u)
+ *entry = index;
+
+ remap[index] = *entry;
+ }
+
+ destination[i] = remap[index];
+ }
+}
+
} // namespace meshopt
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
-
- memset(destination, -1, vertex_count * sizeof(unsigned int));
-
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate<unsigned int>(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
-
- unsigned int next_vertex = 0;
-
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices ? indices[i] : unsigned(i);
- assert(index < vertex_count);
-
- if (destination[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
- if (*entry == ~0u)
- {
- *entry = index;
-
- destination[index] = next_vertex++;
- }
- else
- {
- assert(destination[*entry] != ~0u);
-
- destination[index] = destination[*entry];
- }
- }
- }
-
- assert(next_vertex <= vertex_count);
-
- return next_vertex;
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
}
meshopt_Allocator allocator;
-
- memset(destination, -1, vertex_count * sizeof(unsigned int));
-
VertexStreamHasher hasher = {streams, stream_count};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate<unsigned int>(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
+}
- unsigned int next_vertex = 0;
+size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
+{
+ using namespace meshopt;
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices ? indices[i] : unsigned(i);
- assert(index < vertex_count);
+ assert(indices || index_count == vertex_count);
+ assert(!indices || index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
- if (destination[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+ meshopt_Allocator allocator;
+ VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context};
- if (*entry == ~0u)
- {
- *entry = index;
-
- destination[index] = next_vertex++;
- }
- else
- {
- assert(destination[*entry] != ~0u);
-
- destination[index] = destination[*entry];
- }
- }
- }
-
- assert(next_vertex <= vertex_count);
-
- return next_vertex;
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
@@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned
assert(vertex_size <= vertex_stride);
meshopt_Allocator allocator;
-
- unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
- memset(remap, -1, vertex_count * sizeof(unsigned int));
-
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate<unsigned int>(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
-
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices[i];
- assert(index < vertex_count);
-
- if (remap[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
- if (*entry == ~0u)
- *entry = index;
-
- remap[index] = *entry;
- }
-
- destination[i] = remap[index];
- }
+ generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
}
meshopt_Allocator allocator;
-
- unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
- memset(remap, -1, vertex_count * sizeof(unsigned int));
-
VertexStreamHasher hasher = {streams, stream_count};
+ generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
+}
+
+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+ VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
+
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
- for (size_t i = 0; i < index_count; ++i)
+ for (size_t i = 0; i < vertex_count; ++i)
{
- unsigned int index = indices[i];
- assert(index < vertex_count);
+ unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
- if (remap[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+ if (*entry == ~0u)
+ *entry = unsigned(i);
- if (*entry == ~0u)
- *entry = index;
-
- remap[index] = *entry;
- }
-
- destination[i] = remap[index];
+ destination[i] = *entry;
}
}
@@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
memcpy(destination + i * 4, patch, sizeof(patch));
}
}
+
+size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+ assert(index_count % 3 == 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ // compute vertex valence; this is used to prioritize least used corner
+ // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+ unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+ memset(valence, 0, vertex_count);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ valence[index]++;
+ }
+
+ unsigned int reorder_offset = 0;
+
+ // assign provoking vertices; leave the rest for the next pass
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ // try to rotate triangle such that provoking vertex hasn't been seen before
+ // if multiple vertices are new, prioritize the one with least valence
+ // this reduces the risk that a future triangle will have all three vertices seen
+ unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
+ unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
+ unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
+
+ if (vb != ~0u && vb <= va && vb <= vc)
+ {
+ // abc -> bca
+ unsigned int t = a;
+ a = b, b = c, c = t;
+ }
+ else if (vc != ~0u && vc <= va && vc <= vb)
+ {
+ // abc -> cab
+ unsigned int t = c;
+ c = b, b = a, a = t;
+ }
+
+ unsigned int newidx = reorder_offset;
+
+ // now remap[a] = ~0u or all three vertices are old
+ // recording remap[a] makes it possible to remap future references to the same index, conserving space
+ if (remap[a] == ~0u)
+ remap[a] = newidx;
+
+ // we need to clone the provoking vertex to get a unique index
+ // if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
+ reorder[reorder_offset++] = a;
+
+ // note: first vertex is final, the other two will be fixed up in next pass
+ destination[i + 0] = newidx;
+ destination[i + 1] = b;
+ destination[i + 2] = c;
+
+ // update vertex valences for corner heuristic
+ valence[a]--;
+ valence[b]--;
+ valence[c]--;
+ }
+
+ // remap or clone non-provoking vertices (iterating to skip provoking vertices)
+ int step = 1;
+
+ for (size_t i = 1; i < index_count; i += step, step ^= 3)
+ {
+ unsigned int index = destination[i];
+
+ if (remap[index] == ~0u)
+ {
+ // we haven't seen the vertex before as a provoking vertex
+ // to maintain the reference to the original vertex we need to clone it
+ unsigned int newidx = reorder_offset;
+
+ remap[index] = newidx;
+ reorder[reorder_offset++] = index;
+ }
+
+ destination[i] = remap[index];
+ }
+
+ assert(reorder_offset <= vertex_count + index_count / 3);
+ return reorder_offset;
+}
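+
+// illustrative use of the results (hypothetical names): vertex i of the rebuilt mesh is loaded as vertex_data[reorder[i]],
+// and since destination[t * 3 + 0] == t for every triangle t, a flat/nointerpolation attribute driven by the
+// provoking vertex reproduces the primitive id in the fragment shader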
diff --git a/Source/ThirdParty/meshoptimizer/meshoptimizer.h b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
index 6c8dcd7e8..c9239bc30 100644
--- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h
+++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
/**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 1.0
*
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at https://github.com/zeux/meshoptimizer
*
* This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
#include <stddef.h>
/* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */
/* If no API is defined, assume default */
#ifndef MESHOPTIMIZER_API
@@ -29,11 +29,14 @@
#endif
/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#ifndef MESHOPTIMIZER_EXPERIMENTAL
#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+#endif
/* C interface */
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif
/**
@@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
*/
MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
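+
+/* Illustrative sketch (hypothetical caller data, not part of the library): merge vertices only when both the
+ * position and a per-vertex material id match.
+ *
+ *   int same_material(void* context, unsigned int a, unsigned int b)
+ *   {
+ *       const unsigned int* materials = (const unsigned int*)context;
+ *       return materials[a] == materials[b];
+ *   }
+ *
+ *   size_t unique = meshopt_generateVertexRemapCustom(remap, indices, index_count, positions, vertex_count, sizeof(float) * 3, same_material, (void*)materials);
+ */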
+
/**
* Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
*
@@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
*/
MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+/**
+ * Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
/**
* Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
* Each triangle is converted into a 6-vertex patch with the following layout:
@@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
*/
MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
+ * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
+ * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
+ * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
+ * For maximum efficiency the input index buffer should be optimized for vertex cache first.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
/**
* Vertex transform cache optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
*/
@@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
* Vertex transform cache optimizer for FIFO caches
* Reorders indices to reduce the number of GPU vertex shader invocations
* Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
/**
* Overdraw optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
* Vertex fetch cache optimizer
* Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
* Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
- * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count elements)
* indices is used both as an input and as an output index buffer
@@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
/**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
*/
MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
*/
MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
/**
* Index sequence encoder
* Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
* Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
* This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
* Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
*
* buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
/**
- * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
+
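// Illustrative sketch (hypothetical PackedVertex layout, not from the header): compress an
// already quantized vertex stream at the slowest/best level, keeping the configured format version.
#include <vector>
struct PackedVertex { unsigned short px, py, pz, pw; unsigned int n; unsigned short u, v; }; // 16 bytes, a multiple of 4
static std::vector<unsigned char> compressVertices(const std::vector<PackedVertex>& vertices)
{
    std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(PackedVertex)));
    // level 3 = best ratio, slowest; version -1 keeps the default set via meshopt_encodeVertexVersion
    size_t size = meshopt_encodeVertexBufferLevel(encoded.data(), encoded.size(), vertices.data(), vertices.size(), sizeof(PackedVertex), 3, -1);
    encoded.resize(size); // size == 0 would mean the buffer was too small, which the bound above prevents
    return encoded;
}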
+/**
+ * Set vertex encoder format version (defaults to 1)
+ *
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
*/
MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
@@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
* The decoder is safe to use for untrusted input, but it may produce garbage data.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
/**
* Vertex buffer filters
* These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
*
- * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
*
- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
* Each component is stored as a 16-bit integer; stride must be equal to 8.
*
* meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
* Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ *
+ * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
/**
* Vertex buffer filter encoders
* These functions can be used to encode data in a format that meshopt_decodeFilter can decode
*
- * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
- * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
* Input data must contain 4 floats for every vector (count*4 total).
*
* meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
@@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
* meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
* Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
* Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ *
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
+ * Input data must contain 4 floats for every color (count*4 total).
*/
enum meshopt_EncodeExpMode
{
@@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode
meshopt_EncodeExpSharedVector,
/* When encoding exponents, use shared value for each component of all vectors (best compression) */
meshopt_EncodeExpSharedComponent,
+ /* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+ meshopt_EncodeExpClamped,
};
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
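// Illustrative sketch (not from the header): octahedral-encode unit normals into 4 bytes each
// before meshopt_encodeVertexBuffer, and reverse the filter after meshopt_decodeVertexBuffer.
#include <vector>
static void packNormals(const float* normals, size_t count, std::vector<unsigned char>& out)
{
    // input is x,y,z,w per normal (count*4 floats); 8-bit components -> 4-byte stride
    out.resize(count * 4);
    meshopt_encodeFilterOct(out.data(), count, 4, 8, normals);
}

static void unpackNormalsInPlace(void* decoded, size_t count)
{
    // run on the same stride-4 normal stream right after meshopt_decodeVertexBuffer
    meshopt_decodeFilterOct(decoded, count, 4);
}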
/**
* Simplification options
@@ -334,16 +416,34 @@ enum
meshopt_SimplifySparse = 1 << 1,
/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
meshopt_SimplifyErrorAbsolute = 1 << 2,
+ /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+ meshopt_SimplifyPrune = 1 << 3,
+ /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
+ meshopt_SimplifyRegularize = 1 << 4,
+ /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
+ meshopt_SimplifyPermissive = 1 << 5,
+};
+
+/**
+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
+ */
+enum
+{
+ /* Do not move this vertex. */
+ meshopt_SimplifyVertex_Lock = 1 << 0,
+ /* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
+ meshopt_SimplifyVertex_Protect = 1 << 1,
};
/**
* Mesh simplifier
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
* Returns the number of indices after simplification, with destination containing new index data
+ *
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -354,45 +454,94 @@ enum
MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
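// Illustrative sketch (hypothetical Vertex layout, not from the header): simplify to ~25% of
// the original triangles while pruning small disconnected pieces.
#include <vector>
struct Vertex { float px, py, pz; float nx, ny, nz; float u, v; };
static std::vector<unsigned int> simplifyMesh(const std::vector<unsigned int>& indices, const std::vector<Vertex>& vertices)
{
    std::vector<unsigned int> lod(indices.size()); // worst case is index_count elements
    float error = 0.f;
    size_t count = meshopt_simplify(lod.data(), indices.data(), indices.size(),
        &vertices[0].px, vertices.size(), sizeof(Vertex),
        indices.size() / 4, /* target_error= */ 0.01f, meshopt_SimplifyPrune, &error);
    lod.resize(count);
    return lod;
}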
/**
- * Experimental: Mesh simplifier with attribute metric
- * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ * Mesh simplifier with attribute metric
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
*
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_attributes should have attribute_count floats for each vertex
- * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
- * attribute_count must be <= 16
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
* vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
- * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
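// Illustrative sketch (hypothetical vertex layout and weights, not from the header): let the
// simplifier also consider normals so shading detail is preserved better.
#include <vector>
struct VertexPN { float px, py, pz; float nx, ny, nz; };
static size_t simplifyWithNormals(std::vector<unsigned int>& lod, const std::vector<unsigned int>& indices, const std::vector<VertexPN>& vertices)
{
    const float normal_weights[3] = {0.5f, 0.5f, 0.5f}; // priority of the normal relative to position
    lod.resize(indices.size());
    size_t count = meshopt_simplifyWithAttributes(lod.data(), indices.data(), indices.size(),
        &vertices[0].px, vertices.size(), sizeof(VertexPN),
        &vertices[0].nx, sizeof(VertexPN), normal_weights, 3,
        /* vertex_lock= */ NULL, indices.size() / 4, 0.01f, /* options= */ 0, /* result_error= */ NULL);
    lod.resize(count);
    return count;
}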
/**
- * Experimental: Mesh simplifier (sloppy)
+ * Mesh simplifier with position/attribute update
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, indices are destructively updated with new index data
+ *
+ * The updated index buffer references vertices from the original vertex buffer; however, the vertex positions and attributes are updated in-place.
+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
+/**
+ * Mesh simplifier (sloppy)
* Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
* The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
* Returns the number of indices after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should have 1 set consistently for all indices with the same position
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
/**
- * Experimental: Point cloud simplifier
+ * Mesh simplifier (pruner)
+ * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+
+/**
+ * Point cloud simplifier
* Reduces the number of points in the cloud to reach the given target
* Returns the number of points after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer (target_vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * color_weight determines relative priority of color wrt position; 1.0 is a safe default
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
/**
* Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics
*/
MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+struct meshopt_VertexFetchStatistics
+{
+ unsigned int bytes_fetched;
+ float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
struct meshopt_OverdrawStatistics
{
unsigned int pixels_covered;
@@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics
*/
MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
-struct meshopt_VertexFetchStatistics
+struct meshopt_CoverageStatistics
{
- unsigned int bytes_fetched;
- float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+ float coverage[3];
+ float extent; /* viewport size in mesh coordinates */
};
/**
- * Vertex fetch cache analyzer
- * Returns cache hit statistics using a simplified direct mapped model
- * Results may not match actual GPU performance
+ * Coverage analyzer
+ * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
-MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Meshlet is a small mesh cluster (subset) that consists of:
+ * - triangles, an 8-bit micro triangle (index) buffer that for each triangle specifies three local vertices to use;
+ * - vertices, a 32-bit vertex indirection buffer that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
+ *
+ * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
+ */
struct meshopt_Meshlet
{
/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
unsigned int vertex_offset;
unsigned int triangle_offset;
- /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+ /* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
unsigned int vertex_count;
unsigned int triangle_count;
};
@@ -484,14 +654,15 @@ struct meshopt_Meshlet
* Meshlet builder
* Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
* The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
* When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
* When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
*
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
*/
MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
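// Illustrative sketch (not from the header): build meshlets sized for typical mesh shading
// limits, then optimize each one for rasterizer throughput; buffer sizes follow the comments above.
#include <vector>
static std::vector<meshopt_Meshlet> buildAndOptimizeMeshlets(const std::vector<unsigned int>& indices,
    const float* positions, size_t vertex_count, size_t vertex_stride)
{
    const size_t max_vertices = 64, max_triangles = 124;
    std::vector<meshopt_Meshlet> meshlets(meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles));
    std::vector<unsigned int> meshlet_vertices(indices.size());   // worst case: index_count elements
    std::vector<unsigned char> meshlet_triangles(indices.size()); // worst case: index_count elements

    meshlets.resize(meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), positions, vertex_count, vertex_stride,
        max_vertices, max_triangles, /* cone_weight= */ 0.25f));

    for (const meshopt_Meshlet& m : meshlets)
        meshopt_optimizeMeshlet(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset], m.triangle_count, m.vertex_count);
    return meshlets;
}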
/**
- * Experimental: Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows specifying the minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
*
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet builder that produces clusters optimized for raytracing
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows specifying the minimum and maximum number of triangles per meshlet.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * fill_weight allows prioritizing clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
+ */
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+
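// Illustrative sketch (not from the header): raytracing-oriented clusters with a flexible
// triangle count; note the bound is computed with min_triangles, per the comment above.
#include <vector>
static size_t buildRaytracingClusters(std::vector<meshopt_Meshlet>& meshlets, std::vector<unsigned int>& meshlet_vertices,
    std::vector<unsigned char>& meshlet_triangles, const std::vector<unsigned int>& indices,
    const float* positions, size_t vertex_count, size_t vertex_stride)
{
    const size_t max_vertices = 64, min_triangles = 16, max_triangles = 64;
    meshlets.resize(meshopt_buildMeshletsBound(indices.size(), max_vertices, min_triangles));
    meshlet_vertices.resize(indices.size());
    meshlet_triangles.resize(indices.size());

    size_t count = meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
        indices.data(), indices.size(), positions, vertex_count, vertex_stride,
        max_vertices, min_triangles, max_triangles, /* fill_weight= */ 0.5f);
    meshlets.resize(count);
    return count;
}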
+/**
+ * Meshlet optimizer
+ * Reorders meshlet vertices and triangles to maximize locality, which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
+ *
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
struct meshopt_Bounds
{
@@ -544,11 +742,35 @@ struct meshopt_Bounds
* Real-Time Rendering 4th Edition, section 19.3).
*
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
+ * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
+ * index_count/3 and triangle_count must not exceed implementation limits (<= 512)
*/
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
+ * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
+ */
+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
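// Illustrative sketch (not from the header): group meshlets into partitions of roughly 8
// clusters, feeding each meshlet's vertex index list as its cluster.
#include <vector>
static std::vector<unsigned int> partitionMeshlets(const std::vector<meshopt_Meshlet>& meshlets,
    const std::vector<unsigned int>& meshlet_vertices, const float* positions, size_t vertex_count, size_t vertex_stride)
{
    std::vector<unsigned int> cluster_indices;
    std::vector<unsigned int> cluster_counts;
    for (const meshopt_Meshlet& m : meshlets)
    {
        cluster_indices.insert(cluster_indices.end(), &meshlet_vertices[m.vertex_offset], &meshlet_vertices[m.vertex_offset] + m.vertex_count);
        cluster_counts.push_back(m.vertex_count);
    }

    std::vector<unsigned int> partition(meshlets.size()); // partition[i] = partition id of cluster i
    meshopt_partitionClusters(partition.data(), cluster_indices.data(), cluster_indices.size(),
        cluster_counts.data(), cluster_counts.size(), positions, vertex_count, vertex_stride, /* target_partition_size= */ 8);
    return partition;
}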
+
/**
* Spatial sorter
* Generates a remap table that can be used to reorder points for spatial locality.
@@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig
MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
- * Experimental: Spatial sorter
+ * Spatial sorter
* Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Spatial clusterizer
+ * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
+ * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size.
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
+
+/**
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
+ * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
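// Illustrative sketch (not from the header): the three float quantization helpers above.
static void quantizationExample()
{
    unsigned short h = meshopt_quantizeHalf(0.1f);    // fp16 bit pattern, max relative error ~5e-4
    float roundtrip = meshopt_dequantizeHalf(h);      // approximately 0.1f again
    float trimmed = meshopt_quantizeFloat(0.1f, 10);  // fp32 value with the mantissa limited to 10 bits
    (void)roundtrip; (void)trimmed;
}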
/**
* Set allocation callbacks
@@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
* Note that all algorithms only allocate memory for temporary use.
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
*/
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
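// Illustrative sketch (not from the header): route the library's temporary allocations through
// custom hooks, e.g. to account for scratch memory; the byte counter is hypothetical.
#include <cstdlib>
static size_t g_meshoptScratchBytes = 0; // total bytes requested so far (illustrative only)
static void* meshoptAllocate(size_t size) { g_meshoptScratchBytes += size; return std::malloc(size); }
static void meshoptDeallocate(void* ptr) { std::free(ptr); }

static void installMeshoptAllocator()
{
    meshopt_setAllocator(meshoptAllocate, meshoptDeallocate);
}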
#ifdef __cplusplus
} /* extern "C" */
#endif
-/* Quantization into commonly supported data formats */
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
#ifdef __cplusplus
/**
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
@@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N);
* Maximum reconstruction error: 1/2^N
*/
inline int meshopt_quantizeSnorm(float v, int N);
-
-/**
- * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Representable magnitude range: [6e-5; 65504]
- * Maximum relative reconstruction error: 5e-4
- */
-MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
-
-/**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Assumes N is in a valid mantissa precision range, which is 1..23
- */
-MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
-
-/**
- * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
- * Preserves Inf/NaN, flushes denormals to zero
- */
-MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
#endif
/**
@@ -631,6 +863,10 @@ template <typename T>
inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
template <typename T>
inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
template <typename T>
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
template <typename T>
@@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
template <typename T>
inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
template <typename T>
inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -661,29 +899,44 @@ template <typename T>
inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
template <typename T>
inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
template <typename T>
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template <typename T>
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
template <typename T>
inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
template <typename T>
inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
template <typename T>
inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+template <typename T>
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+template <typename T>
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
#endif
@@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N)
class meshopt_Allocator
{
public:
- template <typename T>
- struct StorageT
+ struct Storage
{
- static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
- static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
+ void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+ void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
};
- typedef StorageT<void> Storage;
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+ MESHOPTIMIZER_API static Storage& storage();
+#else
+ static Storage& storage()
+ {
+ static Storage s = {::operator new, ::operator delete };
+ return s;
+ }
+#endif
meshopt_Allocator()
- : blocks()
- , count(0)
+ : blocks()
+ , count(0)
{
}
~meshopt_Allocator()
{
for (size_t i = count; i > 0; --i)
- Storage::deallocate(blocks[i - 1]);
+ storage().deallocate(blocks[i - 1]);
}
- template <typename T> T* allocate(size_t size)
+ template <typename T>
+ T* allocate(size_t size)
{
assert(count < sizeof(blocks) / sizeof(blocks[0]));
- T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+ T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
blocks[count++] = result;
return result;
}
@@ -749,7 +1010,7 @@ public:
void deallocate(void* ptr)
{
assert(count > 0 && blocks[count - 1] == ptr);
- Storage::deallocate(ptr);
+ storage().deallocate(ptr);
count--;
}
@@ -757,10 +1018,6 @@ private:
void* blocks[24];
size_t count;
};
-
-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
#endif
/* Inline implementation for C++ templated wrappers */
@@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter
{
size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
- data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+ data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));
if (input)
{
@@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter
result[i] = T(data[i]);
}
- meshopt_Allocator::Storage::deallocate(data);
+ meshopt_Allocator::storage().deallocate(data);
}
};
@@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const
return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
}
+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+ struct Call
+ {
+ static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; }
+ };
+
+ return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
+}
+
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+ struct Call
+ {
+ static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; }
+ };
+
+ meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
+
+ return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
+}
+
template <typename T>
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
{
@@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
+template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+ size_t bound = vertex_count + (index_count / 3);
+ assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T
+ (void)bound;
+
+ return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
+}
+
template <typename T>
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
{
@@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
}
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{
+ return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1);
+}
+
template <typename T>
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
{
@@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
}
+template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+ meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+ return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
- return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+ return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
}
template <typename T>
@@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
}
template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
- return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+ return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+ return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
}
template <typename T>
@@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
}
template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
- return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+ return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
template <typename T>
@@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
}
+template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+ return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+ return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
template <typename T>
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
+template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+ meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count);
+
+ return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size);
+}
+
template <typename T>
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
#endif
/**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
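The header changes above expose several new entry points (meshopt_simplifyPrune, meshopt_buildMeshletsFlex/Spatial, meshopt_partitionClusters, meshopt_analyzeCoverage). A minimal usage sketch, not part of the patch, assuming tightly packed float3 positions and hypothetical buffer names; the target of 8 clusters per partition is an illustrative value only:

#include <vector>
#include "meshoptimizer.h"

// Hypothetical driver: assigns a partition id to each cluster produced by meshlet building.
size_t partitionClustersExample(const std::vector<unsigned int>& cluster_indices,      // concatenated per-cluster index lists
                                const std::vector<unsigned int>& cluster_index_counts, // index count for each cluster
                                const std::vector<float>& positions_xyz)               // packed float3 positions
{
    std::vector<unsigned int> partition(cluster_index_counts.size());

    // returns the number of partitions; partition[i] receives the partition id of cluster i
    return meshopt_partitionClusters(partition.data(),
        cluster_indices.data(), cluster_indices.size(),
        cluster_index_counts.data(), cluster_index_counts.size(),
        positions_xyz.data(), positions_xyz.size() / 3, sizeof(float) * 3,
        /* target_partition_size= */ 8);
}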
diff --git a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
index cc22dbcff..682b924a9 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
@@ -10,24 +10,24 @@
namespace meshopt
{
-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float mesh_centroid[3] = {};
- for (size_t i = 0; i < index_count; ++i)
+ for (size_t i = 0; i < vertex_count; ++i)
{
- const float* p = vertex_positions + vertex_stride_float * indices[i];
+ const float* p = vertex_positions + vertex_stride_float * i;
mesh_centroid[0] += p[0];
mesh_centroid[1] += p[1];
mesh_centroid[2] += p[2];
}
- mesh_centroid[0] /= index_count;
- mesh_centroid[1] /= index_count;
- mesh_centroid[2] /= index_count;
+ mesh_centroid[0] /= float(vertex_count);
+ mesh_centroid[1] /= float(vertex_count);
+ mesh_centroid[2] /= float(vertex_count);
for (size_t cluster = 0; cluster < cluster_count; ++cluster)
{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
// fill sort data
 float* sort_data = allocator.allocate<float>(cluster_count);
- calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+ calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
// sort clusters using sort data
 unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
diff --git a/Source/ThirdParty/meshoptimizer/partition.cpp b/Source/ThirdParty/meshoptimizer/partition.cpp
new file mode 100644
index 000000000..4119a53ed
--- /dev/null
+++ b/Source/ThirdParty/meshoptimizer/partition.cpp
@@ -0,0 +1,624 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
+namespace meshopt
+{
+
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
+struct ClusterAdjacency
+{
+ unsigned int* offsets;
+ unsigned int* clusters;
+ unsigned int* shared;
+};
+
+static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
+{
+ (void)vertex_count;
+ (void)total_index_count;
+
+ size_t cluster_start = 0;
+ size_t cluster_write = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ offsets[i] = unsigned(cluster_write);
+
+ // copy cluster indices, skipping duplicates
+ for (size_t j = 0; j < cluster_index_counts[i]; ++j)
+ {
+ unsigned int v = cluster_indices[cluster_start + j];
+ assert(v < vertex_count);
+
+ data[cluster_write] = v;
+ cluster_write += 1 - used[v];
+ used[v] = 1;
+ }
+
+ // reset used flags for the next cluster
+ for (size_t j = offsets[i]; j < cluster_write; ++j)
+ used[data[j]] = 0;
+
+ cluster_start += cluster_index_counts[i];
+ }
+
+ assert(cluster_start == total_index_count);
+ assert(cluster_write <= total_index_count);
+ offsets[cluster_count] = unsigned(cluster_write);
+}
+
+static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
+{
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ float center[3] = {0, 0, 0};
+
+ // approximate center of the cluster by averaging all vertex positions
+ for (size_t j = 0; j < index_count; ++j)
+ {
+ const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+ center[0] += p[0];
+ center[1] += p[1];
+ center[2] += p[2];
+ }
+
+ // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
+ if (index_count)
+ {
+ center[0] /= float(index_count);
+ center[1] /= float(index_count);
+ center[2] /= float(index_count);
+ }
+
+ // compute radius of the bounding sphere for each cluster
+ float radiussq = 0;
+
+ for (size_t j = 0; j < index_count; ++j)
+ {
+ const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+ float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+ radiussq = radiussq < d2 ? d2 : radiussq;
+ }
+
+ memcpy(out_center, center, sizeof(center));
+ return sqrtf(radiussq);
+}
+
+static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+ unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1);
+
+ // compute number of clusters referenced by each vertex
+ memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ ref_offsets[cluster_indices[j]]++;
+ }
+
+ // compute (worst-case) number of adjacent clusters for each cluster
+ size_t total_adjacency = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ size_t count = 0;
+
+ // worst case is every vertex has a disjoint cluster list
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ count += ref_offsets[cluster_indices[j]] - 1;
+
+ // ... but only every other cluster can be adjacent in the end
+ total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
+ }
+
+ // we can now allocate adjacency buffers
+ adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+ adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
+ adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
+
+ // convert ref counts to offsets
+ size_t total_refs = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ size_t count = ref_offsets[i];
+ ref_offsets[i] = unsigned(total_refs);
+ total_refs += count;
+ }
+
+ unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
+
+ // fill cluster refs for each vertex
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i);
+ }
+
+ // after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
+ memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
+ ref_offsets[0] = 0;
+
+ // fill cluster adjacency for each cluster...
+ adjacency.offsets[0] = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
+ unsigned int* shd = adjacency.shared + adjacency.offsets[i];
+ size_t count = 0;
+
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ {
+ unsigned int v = cluster_indices[j];
+
+ // merge the entire cluster list of each vertex into current list
+ for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
+ {
+ unsigned int c = ref_data[k];
+ assert(c < cluster_count);
+
+ if (c == unsigned(i))
+ continue;
+
+ // if the cluster is already in the list, increment the shared count
+ bool found = false;
+ for (size_t l = 0; l < count; ++l)
+ if (adj[l] == c)
+ {
+ found = true;
+ shd[l]++;
+ break;
+ }
+
+ // .. or append a new cluster
+ if (!found)
+ {
+ adj[count] = c;
+ shd[count] = 1;
+ count++;
+ }
+ }
+ }
+
+ // mark the end of the adjacency list; the next cluster will start there as well
+ adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
+ }
+
+ assert(adjacency.offsets[cluster_count] <= total_adjacency);
+
+ // ref_offsets can't be deallocated as it was allocated before adjacency
+ allocator.deallocate(ref_data);
+}
+
+struct ClusterGroup
+{
+ int group;
+ int next;
+ unsigned int size; // 0 unless root
+ unsigned int vertices;
+
+ float center[3];
+ float radius;
+};
+
+struct GroupOrder
+{
+ unsigned int id;
+ int order;
+};
+
+static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
+{
+ // insert a new element at the end (breaks heap invariant)
+ heap[size++] = item;
+
+ // bubble up the new element to its correct position
+ size_t i = size - 1;
+ while (i > 0 && heap[i].order < heap[(i - 1) / 2].order)
+ {
+ size_t p = (i - 1) / 2;
+
+ GroupOrder temp = heap[i];
+ heap[i] = heap[p];
+ heap[p] = temp;
+ i = p;
+ }
+}
+
+static GroupOrder heapPop(GroupOrder* heap, size_t size)
+{
+ assert(size > 0);
+ GroupOrder top = heap[0];
+
+ // move the last element to the top (breaks heap invariant)
+ heap[0] = heap[--size];
+
+ // bubble down the new top element to its correct position
+ size_t i = 0;
+ while (i * 2 + 1 < size)
+ {
+ // find the smallest child
+ size_t j = i * 2 + 1;
+ j += (j + 1 < size && heap[j + 1].order < heap[j].order);
+
+ // if the parent is already smaller than both children, we're done
+ if (heap[j].order >= heap[i].order)
+ break;
+
+ // otherwise, swap the parent and child and continue
+ GroupOrder temp = heap[i];
+ heap[i] = heap[j];
+ heap[j] = temp;
+ i = j;
+ }
+
+ return top;
+}
+
+static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
+{
+ unsigned int total = 0;
+
+ for (int i1 = group1; i1 >= 0; i1 = groups[i1].next)
+ for (int i2 = group2; i2 >= 0; i2 = groups[i2].next)
+ {
+ for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj)
+ if (adjacency.clusters[adj] == unsigned(i2))
+ {
+ total += adjacency.shared[adj];
+ break;
+ }
+ }
+
+ return total;
+}
+
+static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
+{
+ float r1 = target.radius, r2 = source.radius;
+ float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+ float d = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (d + r1 < r2)
+ {
+ target.center[0] = source.center[0];
+ target.center[1] = source.center[1];
+ target.center[2] = source.center[2];
+ target.radius = source.radius;
+ return;
+ }
+
+ if (d + r2 > r1)
+ {
+ float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
+
+ target.center[0] += dx * k;
+ target.center[1] += dy * k;
+ target.center[2] += dz * k;
+ target.radius = (d + r2 + r1) / 2;
+ }
+}
+
+static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
+{
+ float r1 = target.radius, r2 = source.radius;
+ float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+ float d = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
+
+ return mr > 0 ? r1 / mr : 0.f;
+}
+
+static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
+{
+ assert(groups[id].size > 0);
+
+ float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
+
+ int best_group = -1;
+ float best_score = 0;
+
+ for (int ci = id; ci >= 0; ci = groups[ci].next)
+ {
+ for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
+ {
+ int other = groups[adjacency.clusters[adj]].group;
+ if (other < 0)
+ continue;
+
+ assert(groups[other].size > 0);
+ if (groups[id].size + groups[other].size > max_partition_size)
+ continue;
+
+ unsigned int shared = countShared(groups, id, other, adjacency);
+ float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
+
+ // normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
+ float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
+
+ // incorporate spatial score to favor merging nearby groups
+ if (use_bounds)
+ score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
+
+ if (score > best_score)
+ {
+ best_group = other;
+ best_score = score;
+ }
+ }
+ }
+
+ return best_group;
+}
+
+static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
+{
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = order[i];
+ if (groups[id].size == 0 || groups[id].size >= target_partition_size)
+ continue;
+
+ float best_score = -1.f;
+ int best_group = -1;
+
+ for (size_t j = 0; j < count; ++j)
+ {
+ unsigned int other = order[j];
+ if (id == other || groups[other].size == 0)
+ continue;
+
+ if (groups[id].size + groups[other].size > max_partition_size)
+ continue;
+
+ // favor merging nearby groups
+ float score = boundsScore(groups[id], groups[other]);
+
+ if (score > best_score)
+ {
+ best_score = score;
+ best_group = other;
+ }
+ }
+
+ // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
+ if (best_group != -1)
+ {
+ // combine groups by linking them together
+ unsigned int tail = best_group;
+ while (groups[tail].next >= 0)
+ tail = groups[tail].next;
+
+ groups[tail].next = id;
+
+ // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
+ groups[best_group].size += groups[id].size;
+ groups[id].size = 0;
+
+ // merge bounding spheres
+ mergeBounds(groups[best_group], groups[id]);
+ groups[id].radius = 0.f;
+ }
+ }
+}
+
+static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
+{
+ size_t m = 0;
+
+ // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+ for (size_t i = 0; i < count; ++i)
+ {
+ float v = groups[order[i]].center[axis];
+
+ // swap(m, i) unconditionally
+ unsigned int t = order[m];
+ order[m] = order[i];
+ order[i] = t;
+
+ // when v >= pivot, we swap i with m without advancing it, preserving invariants
+ m += v < pivot;
+ }
+
+ return m;
+}
+
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
+{
+ size_t total = 0;
+ for (size_t i = 0; i < count; ++i)
+ total += groups[order[i]].size;
+
+ if (total <= max_partition_size || count <= leaf_size)
+ return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
+
+ float mean[3] = {};
+ float vars[3] = {};
+ float runc = 1, runs = 1;
+
+ // gather statistics on the points in the subtree using Welford's algorithm
+ for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+ {
+ const float* point = groups[order[i]].center;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ float delta = point[k] - mean[k];
+ mean[k] += delta * runs;
+ vars[k] += delta * (point[k] - mean[k]);
+ }
+ }
+
+ // split axis is one where the variance is largest
+ int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+
+ float split = mean[axis];
+ size_t middle = mergePartition(order, count, groups, axis, split);
+
+ // enforce balance for degenerate partitions
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
+ middle = count / 2;
+
+ // recursion depth is logarithmic and bounded due to max depth check above
+ mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+ mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+}
+
+} // namespace meshopt
+
+size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+ using namespace meshopt;
+
+ assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_partition_size > 0);
+
+ size_t max_partition_size = target_partition_size + target_partition_size / 3;
+
+ meshopt_Allocator allocator;
+
+ unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+ memset(used, 0, vertex_count);
+
+ unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
+ unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+
+ // make new cluster index list that filters out duplicate indices
+ filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
+ cluster_indices = cluster_newindices;
+
+ // build cluster adjacency along with edge weights (shared vertex count)
+ ClusterAdjacency adjacency = {};
+ buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
+
+ ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
+ memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
+
+ GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
+ size_t pending = 0;
+
+ // create a singleton group for each cluster and order them by priority
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ groups[i].group = int(i);
+ groups[i].next = -1;
+ groups[i].size = 1;
+ groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
+ assert(groups[i].vertices > 0);
+
+ // compute bounding sphere for each cluster if positions are provided
+ if (vertex_positions)
+ groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
+
+ GroupOrder item = {};
+ item.id = unsigned(i);
+ item.order = groups[i].vertices;
+
+ heapPush(order, pending++, item);
+ }
+
+ // iteratively merge the smallest group with the best group
+ while (pending)
+ {
+ GroupOrder top = heapPop(order, pending--);
+
+ // this group was merged into another group earlier
+ if (groups[top.id].size == 0)
+ continue;
+
+ // disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
+ for (int i = top.id; i >= 0; i = groups[i].next)
+ {
+ assert(groups[i].group == int(top.id));
+ groups[i].group = -1;
+ }
+
+ // the group is large enough, emit as is
+ if (groups[top.id].size >= target_partition_size)
+ continue;
+
+ int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
+
+ // we can't grow the group any more, emit as is
+ if (best_group == -1)
+ continue;
+
+ // compute shared vertices to adjust the total vertices estimate after merging
+ unsigned int shared = countShared(groups, top.id, best_group, adjacency);
+
+ // combine groups by linking them together
+ unsigned int tail = top.id;
+ while (groups[tail].next >= 0)
+ tail = groups[tail].next;
+
+ groups[tail].next = best_group;
+
+ // update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
+ groups[top.id].size += groups[best_group].size;
+ groups[top.id].vertices += groups[best_group].vertices;
+ groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
+
+ groups[best_group].size = 0;
+ groups[best_group].vertices = 0;
+
+ // merge bounding spheres if bounds are available
+ if (vertex_positions)
+ {
+ mergeBounds(groups[top.id], groups[best_group]);
+ groups[best_group].radius = 0;
+ }
+
+ // re-associate all clusters back to the merged group
+ for (int i = top.id; i >= 0; i = groups[i].next)
+ groups[i].group = int(top.id);
+
+ top.order = groups[top.id].vertices;
+ heapPush(order, pending++, top);
+ }
+
+ // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
+ if (vertex_positions)
+ {
+ unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
+ size_t merge_offset = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ if (groups[i].size)
+ merge_order[merge_offset++] = unsigned(i);
+
+ mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
+ }
+
+ // output each remaining group
+ size_t next_group = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ if (groups[i].size == 0)
+ continue;
+
+ for (int j = int(i); j >= 0; j = groups[j].next)
+ destination[j] = unsigned(next_group);
+
+ next_group++;
+ }
+
+ assert(next_group <= cluster_count);
+ return next_group;
+}
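Aside from the patch itself, the sphere merge performed by mergeBounds/boundsScore above is the standard union of two bounding spheres; a self-contained sketch with arbitrary test values, mirroring the formulas used in this file:

#include <math.h>
#include <stdio.h>

struct Sphere { float c[3]; float r; };

// Union of two spheres: if one contains the other, keep the larger one;
// otherwise the result spans the farthest points along the center-to-center axis.
static Sphere sphereUnion(const Sphere& a, const Sphere& b)
{
    float dx = b.c[0] - a.c[0], dy = b.c[1] - a.c[1], dz = b.c[2] - a.c[2];
    float d = sqrtf(dx * dx + dy * dy + dz * dz);

    if (d + b.r <= a.r) return a; // b is inside a
    if (d + a.r <= b.r) return b; // a is inside b

    float r = (d + a.r + b.r) * 0.5f;
    float k = d > 0 ? (r - a.r) / d : 0.f; // same as (d + b.r - a.r) / (2 * d) in mergeBounds

    Sphere out = {{a.c[0] + dx * k, a.c[1] + dy * k, a.c[2] + dz * k}, r};
    return out;
}

int main()
{
    Sphere a = {{0, 0, 0}, 1}, b = {{3, 0, 0}, 1};
    Sphere u = sphereUnion(a, b);
    printf("center %.2f, radius %.2f\n", u.c[0], u.r); // prints: center 1.50, radius 2.50
}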
diff --git a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
similarity index 62%
rename from Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/rasterizer.cpp
index 31cf6f146..bd788ffdb 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
@@ -18,14 +18,6 @@ struct OverdrawBuffer
unsigned int overdraw[kViewport][kViewport][2];
};
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
@@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1,
float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
float invdet = (det == 0) ? 0 : 1 / det;
- dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
- dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+ dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+ dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
return det;
}
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
// bounding rectangle, clipped against viewport
// since we rasterize pixels with covered centers, min >0.5 should round up
// as for max, due to top-left filling convention we will never rasterize right/bottom edges
- // so max >= 0.5 should round down
- int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
- int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
- int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
- int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+ // so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
+ int minx = X1 < X2 ? X1 : X2;
+ minx = minx < X3 ? minx : X3;
+ minx = (minx + 7) >> 4;
+ minx = minx < 0 ? 0 : minx;
+
+ int miny = Y1 < Y2 ? Y1 : Y2;
+ miny = miny < Y3 ? miny : Y3;
+ miny = (miny + 7) >> 4;
+ miny = miny < 0 ? 0 : miny;
+
+ int maxx = X1 > X2 ? X1 : X2;
+ maxx = maxx > X3 ? maxx : X3;
+ maxx = (maxx + 7) >> 4;
+ maxx = maxx > kViewport ? kViewport : maxx;
+
+ int maxy = Y1 > Y2 ? Y1 : Y2;
+ maxy = maxy > Y3 ? maxy : Y3;
+ maxy = (maxy + 7) >> 4;
+ maxy = maxy > kViewport ? kViewport : maxy;
// deltas, 28.4 fixed point
int DX12 = X1 - X2;
@@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
}
}
-} // namespace meshopt
-
-meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
- using namespace meshopt;
-
- assert(index_count % 3 == 0);
- assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
- assert(vertex_positions_stride % sizeof(float) == 0);
-
- meshopt_Allocator allocator;
-
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
- meshopt_OverdrawStatistics result = {};
-
float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
@@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
for (int j = 0; j < 3; ++j)
{
- minv[j] = min(minv[j], v[j]);
- maxv[j] = max(maxv[j], v[j]);
+ float vj = v[j];
+
+ minv[j] = minv[j] > vj ? vj : minv[j];
+ maxv[j] = maxv[j] < vj ? vj : maxv[j];
}
}
- float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
- float scale = kViewport / extent;
+ float extent = 0.f;
- float* triangles = allocator.allocate<float>(index_count * 3);
+ extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+ extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+ extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+ float scale = kViewport / extent;
for (size_t i = 0; i < index_count; ++i)
{
@@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
}
+ return extent;
+}
+
+static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
+{
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ const float* vn0 = &triangles[3 * (i + 0)];
+ const float* vn1 = &triangles[3 * (i + 1)];
+ const float* vn2 = &triangles[3 * (i + 2)];
+
+ switch (axis)
+ {
+ case 0:
+ rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+ break;
+ case 1:
+ rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+ break;
+ case 2:
+ rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+ break;
+ }
+ }
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ meshopt_OverdrawStatistics result = {};
+
+ float* triangles = allocator.allocate<float>(index_count * 3);
+ transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
 OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
-
- for (size_t i = 0; i < index_count; i += 3)
- {
- const float* vn0 = &triangles[3 * (i + 0)];
- const float* vn1 = &triangles[3 * (i + 1)];
- const float* vn2 = &triangles[3 * (i + 2)];
-
- switch (axis)
- {
- case 0:
- rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
- break;
- case 1:
- rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
- break;
- case 2:
- rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
- break;
- }
- }
+ rasterizeTriangles(buffer, triangles, index_count, axis);
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
@@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
return result;
}
+
+meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ meshopt_CoverageStatistics result = {};
+
+ float* triangles = allocator.allocate<float>(index_count * 3);
+ float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+ OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+ for (int axis = 0; axis < 3; ++axis)
+ {
+ memset(buffer, 0, sizeof(OverdrawBuffer));
+ rasterizeTriangles(buffer, triangles, index_count, axis);
+
+ unsigned int covered = 0;
+
+ for (int y = 0; y < kViewport; ++y)
+ for (int x = 0; x < kViewport; ++x)
+ covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0;
+
+ result.coverage[axis] = float(covered) / float(kViewport * kViewport);
+ }
+
+ result.extent = extent;
+
+ return result;
+}
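For reference, a hedged sketch (not part of the patch) of consuming the new coverage analysis; the mesh buffers are assumed to be a valid triangle list with tightly packed float3 positions, and the helper name is hypothetical:

#include <stdio.h>
#include "meshoptimizer.h"

// Hypothetical helper: prints the fraction of the raster viewport covered by the mesh
// when projected along each of the three axes, plus the AABB extent used for scaling.
void reportCoverage(const unsigned int* indices, size_t index_count, const float* positions, size_t vertex_count)
{
    meshopt_CoverageStatistics stats = meshopt_analyzeCoverage(indices, index_count, positions, vertex_count, sizeof(float) * 3);

    printf("coverage x/y/z: %.3f %.3f %.3f (extent %.2f)\n",
        stats.coverage[0], stats.coverage[1], stats.coverage[2], stats.extent);
}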
diff --git a/Source/ThirdParty/meshoptimizer/simplifier.cpp b/Source/ThirdParty/meshoptimizer/simplifier.cpp
index e59b4afcd..14d4d42fe 100644
--- a/Source/ThirdParty/meshoptimizer/simplifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp
@@ -27,6 +27,7 @@
// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
+// Hugues Hoppe, Steve Marschner. Efficient Minimization of New Quadric Metric for Simplifying Meshes with Appearance Attributes. 2000
namespace meshopt
{
@@ -118,10 +119,17 @@ struct PositionHasher
unsigned int ri = sparse_remap ? sparse_remap[index] : index;
 const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + ri * vertex_stride_float);
+ unsigned int x = key[0], y = key[1], z = key[2];
+
+ // replace negative zero with zero
+ x = (x == 0x80000000) ? 0 : x;
+ y = (y == 0x80000000) ? 0 : y;
+ z = (z == 0x80000000) ? 0 : z;
+
// scramble bits to make sure that integer coordinates have entropy in lower bits
- unsigned int x = key[0] ^ (key[0] >> 17);
- unsigned int y = key[1] ^ (key[1] >> 17);
- unsigned int z = key[2] ^ (key[2] >> 17);
+ x ^= x >> 17;
+ y ^= y >> 17;
+ z ^= z >> 17;
// Optimized Spatial Hashing for Collision Detection of Deformable Objects
return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
@@ -132,7 +140,10 @@ struct PositionHasher
unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs;
unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs;
- return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0;
+ const float* lv = vertex_positions + li * vertex_stride_float;
+ const float* rv = vertex_positions + ri * vertex_stride_float;
+
+ return lv[0] == rv[0] && lv[1] == rv[1] && lv[2] == rv[2];
}
};
@@ -208,6 +219,11 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
remap[index] = *entry;
}
+ allocator.deallocate(table);
+
+ if (!wedge)
+ return;
+
// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
for (size_t i = 0; i < vertex_count; ++i)
@@ -221,22 +237,24 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
wedge[i] = wedge[r];
wedge[r] = unsigned(i);
}
-
- allocator.deallocate(table);
}
static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator)
{
// use a bit set to compute the precise number of unique vertices
 unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
- memset(filter, 0, (vertex_count + 7) / 8);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+ filter[index / 8] = 0;
+ }
size_t unique = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
- assert(index < vertex_count);
-
unique += (filter[index / 8] & (1 << (index % 8))) == 0;
filter[index / 8] |= 1 << (index % 8);
}
@@ -255,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
-
unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
if (*entry == ~0u)
@@ -288,14 +305,14 @@ enum VertexKind
};
// manifold vertices can collapse onto anything
-// border/seam vertices can only be collapsed onto border/seam respectively
+// border/seam vertices can collapse onto border/seam respectively, or locked
// complex vertices can collapse onto complex/locked
// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
{1, 1, 1, 1, 1},
- {0, 1, 0, 0, 0},
- {0, 0, 1, 0, 0},
+ {0, 1, 0, 0, 1},
+ {0, 0, 1, 0, 1},
{0, 0, 0, 1, 1},
{0, 0, 0, 0, 0},
};
@@ -303,11 +320,13 @@ const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
// note that for seam edges, the opposite edge isn't present in the attribute-based topology
// but is present if you consider a position-only mesh variant
+// while many complex collapses have the opposite edge, since complex vertices collapse to the
+// same wedge, keeping opposite edges separate improves the quality by considering both targets
const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
- {1, 1, 1, 0, 1},
+ {1, 1, 1, 1, 1},
{1, 0, 1, 0, 0},
{1, 1, 1, 0, 1},
- {0, 0, 0, 0, 0},
+ {1, 0, 0, 0, 0},
{1, 0, 1, 0, 0},
};
@@ -323,14 +342,33 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int
return false;
}
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b, const unsigned int* remap, const unsigned int* wedge)
+{
+ unsigned int v = a;
+
+ do
+ {
+ unsigned int count = adjacency.offsets[v + 1] - adjacency.offsets[v];
+ const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[v];
+
+ for (size_t i = 0; i < count; ++i)
+ if (remap[edges[i].next] == remap[b])
+ return true;
+
+ v = wedge[v];
+ } while (v != a);
+
+ return false;
+}
+
static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options)
{
memset(loop, -1, vertex_count * sizeof(unsigned int));
memset(loopback, -1, vertex_count * sizeof(unsigned int));
// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
- // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
- // but here it's okay to fill the data out for other types of vertices as well
+ // note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default
+ // in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices
unsigned int* openinc = loopback;
unsigned int* openout = loop;
@@ -369,12 +407,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
{
if (remap[i] == i)
{
- if (vertex_lock && vertex_lock[sparse_remap ? sparse_remap[i] : i])
- {
- // vertex is explicitly locked
- result[i] = Kind_Locked;
- }
- else if (wedge[i] == i)
+ if (wedge[i] == i)
{
// no attribute seam, need to check if it's manifold
unsigned int openi = openinc[i], openo = openout[i];
@@ -386,6 +419,13 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
{
result[i] = Kind_Manifold;
}
+ else if (openi != ~0u && openo != ~0u && remap[openi] == remap[openo] && openi != i)
+ {
+ // classify half-seams as seams (the branch below would mis-classify them as borders)
+ // half-seam is a single vertex that connects to both vertices of a potential seam
+ // treating these as seams allows collapsing the "full" seam vertex onto them
+ result[i] = Kind_Seam;
+ }
else if (openi != i && openo != i)
{
result[i] = Kind_Border;
@@ -407,7 +447,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
openiw != ~0u && openiw != w && openow != ~0u && openow != w)
{
- if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+ if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw] && remap[openiv] != remap[openov])
{
result[i] = Kind_Seam;
}
@@ -438,6 +478,58 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
}
}
+ if (options & meshopt_SimplifyPermissive)
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[i] == Kind_Seam || result[i] == Kind_Locked)
+ {
+ if (remap[i] != i)
+ {
+ // only process primary vertices; wedges will be updated to match the primary vertex
+ result[i] = result[remap[i]];
+ continue;
+ }
+
+ bool protect = false;
+
+ // vertex_lock may protect any wedge, not just the primary vertex, so we switch to complex only if no wedges are protected
+ unsigned int v = unsigned(i);
+ do
+ {
+ unsigned int rv = sparse_remap ? sparse_remap[v] : v;
+ protect |= vertex_lock && (vertex_lock[rv] & meshopt_SimplifyVertex_Protect) != 0;
+ v = wedge[v];
+ } while (v != i);
+
+ // protect if any adjoining edge doesn't have an opposite edge (indicating vertex is on the border)
+ do
+ {
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[v]];
+ size_t count = adjacency.offsets[v + 1] - adjacency.offsets[v];
+
+ for (size_t j = 0; j < count; ++j)
+ protect |= !hasEdge(adjacency, edges[j].next, v, remap, wedge);
+ v = wedge[v];
+ } while (v != i);
+
+ result[i] = protect ? result[i] : int(Kind_Complex);
+ }
+
+ if (vertex_lock)
+ {
+ // vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+ if (vertex_lock[ri] & meshopt_SimplifyVertex_Lock)
+ result[remap[i]] = Kind_Locked;
+ }
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[remap[i]] == Kind_Locked)
+ result[i] = Kind_Locked;
+ }
+
if (options & meshopt_SimplifyLockBorder)
for (size_t i = 0; i < vertex_count; ++i)
if (result[i] == Kind_Border)
@@ -454,7 +546,7 @@ struct Vector3
float x, y, z;
};
-static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL)
+static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL, float* out_offset = NULL)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
@@ -500,10 +592,17 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat
}
}
+ if (out_offset)
+ {
+ out_offset[0] = minv[0];
+ out_offset[1] = minv[1];
+ out_offset[2] = minv[2];
+ }
+
return extent;
}
-static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* sparse_remap)
+static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* attribute_remap, const unsigned int* sparse_remap)
{
size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
@@ -513,18 +612,61 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
for (size_t k = 0; k < attribute_count; ++k)
{
- float a = vertex_attributes_data[ri * vertex_attributes_stride_float + k];
+ unsigned int rk = attribute_remap[k];
+ float a = vertex_attributes_data[ri * vertex_attributes_stride_float + rk];
- result[i * attribute_count + k] = a * attribute_weights[k];
+ result[i * attribute_count + k] = a * attribute_weights[rk];
}
}
}
-static const size_t kMaxAttributes = 16;
+static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
+{
+ size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
+ size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+ // updating externally locked vertices is not allowed
+ if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
+ continue;
+
+ // moving locked vertices may result in floating point drift
+ if (vertex_kind[i] != Kind_Locked)
+ {
+ const Vector3& p = vertex_positions[i];
+ float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+
+ v[0] = p.x * vertex_scale + vertex_offset[0];
+ v[1] = p.y * vertex_scale + vertex_offset[1];
+ v[2] = p.z * vertex_scale + vertex_offset[2];
+ }
+
+ if (attribute_count)
+ {
+ const float* sa = vertex_attributes + i * attribute_count;
+ float* va = vertex_attributes_data + ri * vertex_attributes_stride_float;
+
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ unsigned int rk = attribute_remap[k];
+
+ va[rk] = sa[k] / attribute_weights[rk];
+ }
+ }
+ }
+}
+
+static const size_t kMaxAttributes = 32;
struct Quadric
{
- // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c
+ // a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c
float a00, a11, a22;
float a10, a20, a21;
float b0, b1, b2, c;
@@ -586,6 +728,14 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
Q.w += R.w;
}
+static void quadricAdd(QuadricGrad& G, const QuadricGrad& R)
+{
+ G.gx += R.gx;
+ G.gy += R.gy;
+ G.gz += R.gz;
+ G.gw += R.gw;
+}
+
static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count)
{
for (size_t k = 0; k < attribute_count; ++k)
@@ -597,7 +747,7 @@ static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_co
}
}
-static float quadricError(const Quadric& Q, const Vector3& v)
+static float quadricEval(const Quadric& Q, const Vector3& v)
{
float rx = Q.b0;
float ry = Q.b1;
@@ -620,6 +770,12 @@ static float quadricError(const Quadric& Q, const Vector3& v)
r += ry * v.y;
r += rz * v.z;
+ return r;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+ float r = quadricEval(Q, v);
float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
return fabsf(r) * s;
@@ -627,26 +783,7 @@ static float quadricError(const Quadric& Q, const Vector3& v)
static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va)
{
- float rx = Q.b0;
- float ry = Q.b1;
- float rz = Q.b2;
-
- rx += Q.a10 * v.y;
- ry += Q.a21 * v.z;
- rz += Q.a20 * v.x;
-
- rx *= 2;
- ry *= 2;
- rz *= 2;
-
- rx += Q.a00 * v.x;
- ry += Q.a11 * v.y;
- rz += Q.a22 * v.z;
-
- float r = Q.c;
- r += rx * v.x;
- r += ry * v.y;
- r += rz * v.z;
+ float r = quadricEval(Q, v);
// see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
for (size_t k = 0; k < attribute_count; ++k)
@@ -654,14 +791,11 @@ static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribu
float a = va[k];
float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw;
- r += a * a * Q.w;
- r -= 2 * a * g;
+ r += a * (a * Q.w - 2 * g);
}
- // TODO: weight normalization is breaking attribute error somehow
- float s = 1; // Q.w == 0.f ? 0.f : 1.f / Q.w;
-
- return fabsf(r) * s;
+ // note: unlike position error, we do not normalize by Q.w to retain edge scaling as described in quadricFromAttributes
+ return fabsf(r);
}
static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
@@ -684,6 +818,17 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
Q.w = w;
}
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+ Q.a00 = Q.a11 = Q.a22 = w;
+ Q.a10 = Q.a20 = Q.a21 = 0;
+ Q.b0 = -x * w;
+ Q.b1 = -y * w;
+ Q.b2 = -z * w;
+ Q.c = (x * x + y * y + z * z) * w;
+ Q.w = w;
+}
+
static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
{
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
@@ -702,20 +847,24 @@ static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1
static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
{
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
- float length = normalize(p10);
- // p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+ // edge length; keep squared length around for projection correction
+ float lengthsq = p10.x * p10.x + p10.y * p10.y + p10.z * p10.z;
+ float length = sqrtf(lengthsq);
+
+ // p20p = length of projection of p2-p0 onto p1-p0; note that p10 is unnormalized so we need to correct it later
Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
- // normal = altitude of triangle from point p2 onto edge p1-p0
- Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
- normalize(normal);
+ // perp = perpendicular vector from p2 to line segment p1-p0
+ // note: since p10 is unnormalized we need to correct the projection; we scale p20 instead to take advantage of normalize below
+ Vector3 perp = {p20.x * lengthsq - p10.x * p20p, p20.y * lengthsq - p10.y * p20p, p20.z * lengthsq - p10.z * p20p};
+ normalize(perp);
- float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+ float distance = perp.x * p0.x + perp.y * p0.y + perp.z * p0.z;
// note: the weight is scaled linearly with edge length; this has to match the triangle weight
- quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+ quadricFromPlane(Q, perp.x, perp.y, perp.z, -distance, length * weight);
}
static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count)
@@ -728,16 +877,21 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
- // weight is scaled linearly with edge length
+ // normal = cross(p1 - p0, p2 - p0)
Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
- float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
- float w = sqrtf(area); // TODO this needs more experimentation
+ float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z) * 0.5f;
+
+ // quadric is weighted with the square of edge length (= area)
+ // this equalizes the units with the positional error (which, after normalization, is a square of distance)
+ // as a result, a change in weighted attribute of 1 along distance d is approximately equivalent to a change in position of d
+ float w = area;
// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
// v = (d11 * d20 - d01 * d21) / denom
// w = (d00 * d21 - d01 * d20) / denom
// u = 1 - v - w
// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+ // note: v2 and d20/d21 can not be evaluated here as v2 is effectively an unknown variable; we need these only as variables for derivation of gradients
const Vector3& v0 = p10;
const Vector3& v1 = p20;
float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
@@ -747,7 +901,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
float denomr = denom == 0 ? 0.f : 1.f / denom;
// precompute gradient factors
- // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+ // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out expressions that are shared between attributes
float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
@@ -772,6 +926,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+ // note: for simplicity we scale all factors by weight here instead of outside the loop
Q.a00 += w * (gx * gx);
Q.a11 += w * (gy * gy);
Q.a22 += w * (gz * gz);
@@ -794,7 +949,112 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
}
}
-static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+static void quadricVolumeGradient(QuadricGrad& G, const Vector3& p0, const Vector3& p1, const Vector3& p2)
+{
+ Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+ Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+ // normal = cross(p1 - p0, p2 - p0)
+ Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+ float area = normalize(normal) * 0.5f;
+
+ G.gx = normal.x * area;
+ G.gy = normal.y * area;
+ G.gz = normal.z * area;
+ G.gw = (-p0.x * normal.x - p0.y * normal.y - p0.z * normal.z) * area;
+}
+
+static bool quadricSolve(Vector3& p, const Quadric& Q, const QuadricGrad& GV)
+{
+ // solve A*p = -b where A is the quadric matrix and b is the linear term
+ float a00 = Q.a00, a11 = Q.a11, a22 = Q.a22;
+ float a10 = Q.a10, a20 = Q.a20, a21 = Q.a21;
+ float x0 = -Q.b0, x1 = -Q.b1, x2 = -Q.b2;
+
+ float eps = 1e-6f * Q.w;
+
+ // LDL decomposition: A = LDL^T
+ float d0 = a00;
+ float l10 = a10 / d0;
+ float l20 = a20 / d0;
+
+ float d1 = a11 - a10 * l10;
+ float dl21 = a21 - a20 * l10;
+ float l21 = dl21 / d1;
+
+ float d2 = a22 - a20 * l20 - dl21 * l21;
+
+ // solve L*y = x
+ float y0 = x0;
+ float y1 = x1 - l10 * y0;
+ float y2 = x2 - l20 * y0 - l21 * y1;
+
+ // solve D*z = y
+ float z0 = y0 / d0;
+ float z1 = y1 / d1;
+ float z2 = y2 / d2;
+
+ // augment system with linear constraint GV using Lagrange multiplier
+ float a30 = GV.gx, a31 = GV.gy, a32 = GV.gz;
+ float x3 = -GV.gw;
+
+ float l30 = a30 / d0;
+ float dl31 = a31 - a30 * l10;
+ float l31 = dl31 / d1;
+ float dl32 = a32 - a30 * l20 - dl31 * l21;
+ float l32 = dl32 / d2;
+ float d3 = 0.f - a30 * l30 - dl31 * l31 - dl32 * l32;
+
+ float y3 = x3 - l30 * y0 - l31 * y1 - l32 * y2;
+ float z3 = fabsf(d3) > eps ? y3 / d3 : 0.f; // if d3 is zero, we can ignore the constraint
+
+ // substitute L^T*p = z
+ float lambda = z3;
+ float pz = z2 - l32 * lambda;
+ float py = z1 - l21 * pz - l31 * lambda;
+ float px = z0 - l10 * py - l20 * pz - l30 * lambda;
+
+ p.x = px;
+ p.y = py;
+ p.z = pz;
+
+ return fabsf(d0) > eps && fabsf(d1) > eps && fabsf(d2) > eps;
+}
+
+static void quadricReduceAttributes(Quadric& Q, const Quadric& A, const QuadricGrad* G, size_t attribute_count)
+{
+ // update vertex quadric with attribute quadric; multiply by vertex weight to minimize normalized error
+ Q.a00 += A.a00 * Q.w;
+ Q.a11 += A.a11 * Q.w;
+ Q.a22 += A.a22 * Q.w;
+ Q.a10 += A.a10 * Q.w;
+ Q.a20 += A.a20 * Q.w;
+ Q.a21 += A.a21 * Q.w;
+ Q.b0 += A.b0 * Q.w;
+ Q.b1 += A.b1 * Q.w;
+ Q.b2 += A.b2 * Q.w;
+
+ float iaw = A.w == 0 ? 0.f : Q.w / A.w;
+
+ // update linear system based on attribute gradients (BB^T/a)
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ const QuadricGrad& g = G[k];
+
+ Q.a00 -= (g.gx * g.gx) * iaw;
+ Q.a11 -= (g.gy * g.gy) * iaw;
+ Q.a22 -= (g.gz * g.gz) * iaw;
+ Q.a10 -= (g.gx * g.gy) * iaw;
+ Q.a20 -= (g.gx * g.gz) * iaw;
+ Q.a21 -= (g.gy * g.gz) * iaw;
+
+ Q.b0 -= (g.gx * g.gw) * iaw;
+ Q.b1 -= (g.gy * g.gw) * iaw;
+ Q.b2 -= (g.gz * g.gw) * iaw;
+ }
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, QuadricGrad* volume_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
{
for (size_t i = 0; i < index_count; i += 3)
{
@@ -808,6 +1068,36 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
quadricAdd(vertex_quadrics[remap[i2]], Q);
+
+ if (volume_gradients)
+ {
+ QuadricGrad GV;
+ quadricVolumeGradient(GV, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2]);
+
+ quadricAdd(volume_gradients[remap[i0]], GV);
+ quadricAdd(volume_gradients[remap[i1]], GV);
+ quadricAdd(volume_gradients[remap[i2]], GV);
+ }
+ }
+}
+
+static void fillVertexQuadrics(Quadric* vertex_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* remap, unsigned int options)
+{
+ // by default, we use a very small weight to improve triangulation and numerical stability without affecting the shape or error
+ float factor = (options & meshopt_SimplifyRegularize) ? 1e-1f : 1e-7f;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] != i)
+ continue;
+
+ const Vector3& p = vertex_positions[i];
+ float w = vertex_quadrics[i].w * factor;
+
+ Quadric Q;
+ quadricFromPoint(Q, p.x, p.y, p.z, w);
+
+ quadricAdd(vertex_quadrics[i], Q);
}
}
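As a quick sanity check on the regularization added in fillVertexQuadrics above: a point quadric built by quadricFromPoint evaluates to the weighted squared distance to that point, so the small default weight gently penalizes solutions that drift far from existing vertex positions. A standalone numeric check with hypothetical values, using the expansion from the Quadric comment:

#include <stdio.h>

int main()
{
    float px = 1, py = 2, pz = 3, w = 0.5f;

    // coefficients as produced by quadricFromPoint
    float a00 = w, a11 = w, a22 = w, a10 = 0, a20 = 0, a21 = 0;
    float b0 = -px * w, b1 = -py * w, b2 = -pz * w;
    float c = (px * px + py * py + pz * pz) * w;

    float x = 4, y = 6, z = 3; // arbitrary query position

    // a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c
    float r = a00 * x * x + a11 * y * y + a22 * z * z +
              2 * (a10 * x * y + a20 * x * z + a21 * y * z) +
              2 * (b0 * x + b1 * y + b2 * z) + c;

    float d2 = (x - px) * (x - px) + (y - py) * (y - py) + (z - pz) * (z - pz);
    printf("quadric %.2f, w * dist^2 %.2f\n", r, w * d2); // both print 12.50
}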
@@ -837,15 +1127,11 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
continue;
- // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
- if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
- continue;
-
unsigned int i2 = indices[i + next[e + 1]];
// we try hard to maintain border edge geometry; seam edges can move more freely
// due to topological restrictions on collapses, seam quadrics slightly improve collapse structure but aren't critical
- const float kEdgeWeightSeam = 1.f;
+ const float kEdgeWeightSeam = 0.5f; // applied twice due to opposite edges
const float kEdgeWeightBorder = 10.f;
float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
@@ -853,13 +1139,20 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
Quadric Q;
quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+ Quadric QT;
+ quadricFromTriangle(QT, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+ // mix edge quadric with triangle quadric to stabilize collapses in both directions; both quadrics inherit edge weight so that their error is added
+ QT.w = 0;
+ quadricAdd(Q, QT);
+
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
}
}
}
-static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap)
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count)
{
for (size_t i = 0; i < index_count; i += 3)
{
@@ -871,14 +1164,13 @@ static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attr
QuadricGrad G[kMaxAttributes];
quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
- // TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
- quadricAdd(attribute_quadrics[remap[i0]], QA);
- quadricAdd(attribute_quadrics[remap[i1]], QA);
- quadricAdd(attribute_quadrics[remap[i2]], QA);
+ quadricAdd(attribute_quadrics[i0], QA);
+ quadricAdd(attribute_quadrics[i1], QA);
+ quadricAdd(attribute_quadrics[i2], QA);
- quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count);
- quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count);
- quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i0 * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i1 * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i2 * attribute_count], G, attribute_count);
}
}
@@ -922,6 +1214,30 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
continue;
// early-out when at least one triangle flips due to a collapse
+ if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
+ {
+#if TRACE >= 2
+ printf("edge block %d -> %d: flip welded %d %d %d\n", i0, i1, a, i0, b);
+#endif
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
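+// variant used when solving vertex positions: checks if moving vertex i0 to a new position v1 would flip any of its adjacent triangles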
+static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0, const Vector3& v1)
+{
+ const Vector3& v0 = vertex_positions[i0];
+
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+ size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int a = edges[i].next, b = edges[i].prev;
+
if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
return true;
}
@@ -929,6 +1245,46 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
return false;
}
+static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0)
+{
+ const Vector3& v0 = vertex_positions[i0];
+
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+ size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+ float result = 0.f;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int a = edges[i].next, b = edges[i].prev;
+
+ const Vector3& va = vertex_positions[a];
+ const Vector3& vb = vertex_positions[b];
+
+ float da = (va.x - v0.x) * (va.x - v0.x) + (va.y - v0.y) * (va.y - v0.y) + (va.z - v0.z) * (va.z - v0.z);
+ float db = (vb.x - v0.x) * (vb.x - v0.x) + (vb.y - v0.y) * (vb.y - v0.y) + (vb.z - v0.z) * (vb.z - v0.z);
+
+ result = result < da ? da : result;
+ result = result < db ? db : result;
+ }
+
+ return sqrtf(result);
+}
+
+static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback)
+{
+ unsigned int r = remap[target];
+
+ // use loop metadata to guide complex collapses towards the correct wedge
+ // this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams
+ if (loop[v] != ~0u && remap[loop[v]] == r)
+ return loop[v];
+ else if (loopback[v] != ~0u && remap[loopback[v]] == r)
+ return loopback[v];
+ else
+ return target;
+}
+
static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
{
size_t dual_count = 0;
@@ -947,7 +1303,7 @@ static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_c
return (index_count - dual_count / 2) + 3;
}
-static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
{
size_t collapse_count = 0;
@@ -983,8 +1339,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
// two vertices are on a border or a seam, but there's no direct edge between them
// this indicates that they belong to two different edge loops and we should not collapse this edge
- // loop[] tracks half edges so we only need to check i0->i1
- if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+ // loop[] and loopback[] track half edges so we only need to check one of them
+ if ((k0 == Kind_Border || k0 == Kind_Seam) && k1 != Kind_Manifold && loop[i0] != i1)
+ continue;
+ if ((k1 == Kind_Border || k1 == Kind_Seam) && k0 != Kind_Manifold && loopback[i1] != i0)
continue;
// edge can be collapsed in either direction - we will pick the one with minimum error
@@ -1009,7 +1367,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
return collapse_count;
}
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
{
for (size_t i = 0; i < collapse_count; ++i)
{
@@ -1017,40 +1375,94 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
unsigned int i0 = c.v0;
unsigned int i1 = c.v1;
-
- // most edges are bidirectional which means we need to evaluate errors for two collapses
- // to keep this code branchless we just use the same edge for unidirectional edges
- unsigned int j0 = c.bidi ? i1 : i0;
- unsigned int j1 = c.bidi ? i0 : i1;
+ bool bidi = c.bidi;
float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
- float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+ float ej = bidi ? quadricError(vertex_quadrics[remap[i1]], vertex_positions[i0]) : FLT_MAX;
+
+#if TRACE >= 3
+ float di = ei, dj = ej;
+#endif
if (attribute_count)
{
- ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
- ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+ ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+ ej += bidi ? quadricError(attribute_quadrics[i1], &attribute_gradients[i1 * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]) : 0;
+
+ // seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+ if (vertex_kind[i0] == Kind_Seam)
+ {
+ // for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+ unsigned int s0 = wedge[i0];
+ unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+ assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+ assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+ // note: this should never happen due to the assertion above, but if asserts are disabled and we ever hit this case we'd get a memory safety issue; for now play it safe
+ s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+ ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+ ej += bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+ }
+ else
+ {
+ // complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target
+ if (vertex_kind[i0] == Kind_Complex)
+ for (unsigned int v = wedge[i0]; v != i0; v = wedge[v])
+ {
+ unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+ ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+ }
+
+ if (vertex_kind[i1] == Kind_Complex && bidi)
+ for (unsigned int v = wedge[i1]; v != i1; v = wedge[v])
+ {
+ unsigned int t = getComplexTarget(v, i0, remap, loop, loopback);
+
+ ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+ }
+ }
}
- // pick edge direction with minimal error
- c.v0 = ei <= ej ? i0 : j0;
- c.v1 = ei <= ej ? i1 : j1;
- c.error = ei <= ej ? ei : ej;
+ // pick edge direction with minimal error (branchless)
+ bool rev = bidi & (ej < ei);
+
+ c.v0 = rev ? i1 : i0;
+ c.v1 = rev ? i0 : i1;
+ c.error = ej < ei ? ej : ei;
+
+#if TRACE >= 3
+ if (bidi)
+ printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n",
+ rev ? i1 : i0, rev ? i0 : i1,
+ sqrtf(rev ? ej : ei), sqrtf(rev ? dj : di), sqrtf(rev ? ej - dj : ei - di),
+ sqrtf(rev ? ei : ej), sqrtf(rev ? di : dj), sqrtf(rev ? ei - di : ej - dj));
+ else
+ printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", i0, i1, sqrtf(c.error), sqrtf(di), sqrtf(ei - di));
+#endif
}
}
static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
{
- const int sort_bits = 11;
+ // we use counting sort to order collapses by error; since the exact sort order is not as critical,
+ // only top 12 bits of exponent+mantissa (8 bits of exponent and 4 bits of mantissa) are used.
+ // to avoid excessive stack usage, we clamp the exponent range as collapses with errors much higher than 1 are not useful.
+ const unsigned int sort_bits = 12;
+ const unsigned int sort_bins = 2048 + 512; // exponent range [-127, 32)
// fill histogram for counting sort
- unsigned int histogram[1 << sort_bits];
+ unsigned int histogram[sort_bins];
memset(histogram, 0, sizeof(histogram));
for (size_t i = 0; i < collapse_count; ++i)
{
// skip sign bit since error is non-negative
- unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+ unsigned int error = collapses[i].errorui;
+ unsigned int key = (error << 1) >> (32 - sort_bits);
+ key = key < sort_bins ? key : sort_bins - 1;
histogram[key]++;
}
@@ -1058,7 +1470,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
// compute offsets based on histogram data
size_t histogram_sum = 0;
- for (size_t i = 0; i < 1 << sort_bits; ++i)
+ for (size_t i = 0; i < sort_bins; ++i)
{
size_t count = histogram[i];
histogram[i] = unsigned(histogram_sum);
@@ -1071,13 +1483,15 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
for (size_t i = 0; i < collapse_count; ++i)
{
// skip sign bit since error is non-negative
- unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+ unsigned int error = collapses[i].errorui;
+ unsigned int key = (error << 1) >> (32 - sort_bits);
+ key = key < sort_bins ? key : sort_bins - 1;
sort_order[histogram[key]++] = unsigned(i);
}
}
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
{
size_t edge_collapses = 0;
size_t triangle_collapses = 0;
@@ -1087,7 +1501,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
size_t edge_collapse_goal = triangle_collapse_goal / 2;
#if TRACE
- size_t stats[4] = {};
+ size_t stats[7] = {};
#endif
for (size_t i = 0; i < collapse_count; ++i)
@@ -1097,10 +1511,16 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
TRACESTATS(0);
if (c.error > error_limit)
+ {
+ TRACESTATS(4);
break;
+ }
if (triangle_collapses >= triangle_collapse_goal)
+ {
+ TRACESTATS(5);
break;
+ }
// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
// as they will share vertices with other successful collapses, we need to increase the acceptable error by some factor
@@ -1108,8 +1528,11 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
// topology, we only abort if we got over 1/6 collapses accordingly.
- if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 6)
+ if (c.error > error_goal && c.error > result_error && triangle_collapses > triangle_collapse_goal / 6)
+ {
+ TRACESTATS(6);
break;
+ }
unsigned int i0 = c.v0;
unsigned int i1 = c.v1;
@@ -1117,6 +1540,8 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
unsigned int r0 = remap[i0];
unsigned int r1 = remap[i1];
+ unsigned char kind = vertex_kind[i0];
+
// we don't collapse vertices that had source or target vertex involved in a collapse
// it's important to not move the vertices twice since it complicates the tracking/remapping logic
// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
@@ -1135,35 +1560,41 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
continue;
}
+#if TRACE >= 2
+ printf("edge commit %d -> %d: kind %d->%d, error %f\n", i0, i1, vertex_kind[i0], vertex_kind[i1], sqrtf(c.error));
+#endif
+
assert(collapse_remap[r0] == r0);
assert(collapse_remap[r1] == r1);
- quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
-
- if (attribute_count)
- {
- quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
- quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
- }
-
- if (vertex_kind[i0] == Kind_Complex)
+ if (kind == Kind_Complex)
{
+ // remap all vertices in the complex to the target vertex
unsigned int v = i0;
do
{
- collapse_remap[v] = r1;
+ unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+ collapse_remap[v] = t;
v = wedge[v];
} while (v != i0);
}
- else if (vertex_kind[i0] == Kind_Seam)
+ else if (kind == Kind_Seam)
{
- // remap v0 to v1 and seam pair of v0 to seam pair of v1
+ // for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
unsigned int s0 = wedge[i0];
- unsigned int s1 = wedge[i1];
+ unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+ assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+ assert(s1 != ~0u && remap[s1] == r1);
- assert(s0 != i0 && s1 != i1);
- assert(wedge[s0] == i0 && wedge[s1] == i1);
+ // additional asserts to verify that the seam pair is consistent
+ assert(kind != vertex_kind[i1] || s1 == wedge[i1]);
+ assert(loop[i0] == i1 || loopback[i0] == i1);
+ assert(loop[s0] == s1 || loopback[s0] == s1);
+
+ // note: this should never happen due to the assertion above, but if asserts are disabled and we ever hit this case we'd get a memory safety issue; for now play it safe
+ s1 = (s1 != ~0u) ? s1 : wedge[i1];
collapse_remap[i0] = i1;
collapse_remap[s0] = s1;
@@ -1175,28 +1606,205 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
collapse_remap[i0] = i1;
}
+ // note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used
+ // however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling
collapse_locked[r0] = 1;
collapse_locked[r1] = 1;
// border edges collapse 1 triangle, other edges collapse 2 or more
- triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+ triangle_collapses += (kind == Kind_Border) ? 1 : 2;
edge_collapses++;
result_error = result_error < c.error ? c.error : result_error;
}
#if TRACE
- float error_goal_perfect = edge_collapse_goal < collapse_count ? collapses[collapse_order[edge_collapse_goal]].error : 0.f;
+ float error_goal_last = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+ float error_goal_limit = error_goal_last < error_limit ? error_goal_last : error_limit;
- printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d)\n",
- int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_perfect),
- int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]));
+ printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d); %s\n",
+ int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_limit),
+ int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]),
+ stats[4] ? "error limit" : (stats[5] ? "count limit" : (stats[6] ? "error goal" : "out of collapses")));
#endif
return edge_collapses;
}
-static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, QuadricGrad* volume_gradients, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (collapse_remap[i] == i)
+ continue;
+
+ unsigned int i0 = unsigned(i);
+ unsigned int i1 = collapse_remap[i];
+
+ unsigned int r0 = remap[i0];
+ unsigned int r1 = remap[i1];
+
+ // ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved
+ if (i0 == r0)
+ {
+ quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+ if (volume_gradients)
+ quadricAdd(volume_gradients[r1], volume_gradients[r0]);
+ }
+
+ if (attribute_count)
+ {
+ quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]);
+ quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count);
+
+ if (i0 == r0)
+ {
+ // when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment
+ float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]);
+ vertex_error = vertex_error < derr ? derr : vertex_error;
+ }
+ }
+ }
+}
+
+static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+#if TRACE
+ size_t stats[6] = {};
+#endif
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ // moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
+ // moving vertices on a border requires a stronger edge quadric to preserve the border geometry
+ if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
+ continue;
+
+ if (remap[i] != i)
+ {
+ vertex_positions[i] = vertex_positions[remap[i]];
+ continue;
+ }
+
+ TRACESTATS(0);
+
+ const Vector3& vp = vertex_positions[i];
+
+ Quadric Q = vertex_quadrics[i];
+ QuadricGrad GV = {};
+
+ // add a point quadric for regularization to stabilize the solution
+ Quadric R;
+ quadricFromPoint(R, vp.x, vp.y, vp.z, Q.w * 1e-4f);
+ quadricAdd(Q, R);
+
+ if (attribute_count)
+ {
+ // optimal point simultaneously minimizes attribute quadrics for all wedges
+ unsigned int v = unsigned(i);
+ do
+ {
+ quadricReduceAttributes(Q, attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count);
+ v = wedge[v];
+ } while (v != i);
+
+ // minimizing attribute quadrics results in volume loss so we incorporate volume gradient as a constraint
+ if (volume_gradients)
+ GV = volume_gradients[i];
+ }
+
+ Vector3 p;
+ if (!quadricSolve(p, Q, GV))
+ {
+ TRACESTATS(2);
+ continue;
+ }
+
+ // reject updates that move the vertex too far from its neighborhood
+ // this detects and fixes most cases when the quadric is not well-defined
+ float nr = getNeighborhoodRadius(adjacency, vertex_positions, unsigned(i));
+ float dp = (p.x - vp.x) * (p.x - vp.x) + (p.y - vp.y) * (p.y - vp.y) + (p.z - vp.z) * (p.z - vp.z);
+
+ if (dp > nr * nr)
+ {
+ TRACESTATS(3);
+ continue;
+ }
+
+ // reject updates that would flip a neighboring triangle, as we do for edge collapse
+ if (hasTriangleFlips(adjacency, vertex_positions, unsigned(i), p))
+ {
+ TRACESTATS(4);
+ continue;
+ }
+
+ // reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+ if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+ {
+ TRACESTATS(5);
+ continue;
+ }
+
+ TRACESTATS(1);
+ vertex_positions[i] = p;
+ }
+
+#if TRACE
+ printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
+#endif
+}
+
+static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ if (remap[i] != i)
+ continue;
+
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ unsigned int shared = ~0u;
+
+ // for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+ if (vertex_kind[i] == Kind_Complex)
+ {
+ shared = unsigned(i);
+
+ for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+ if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+ shared = ~0u;
+ else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+ shared = v;
+ }
+
+ // update attributes for all wedges
+ unsigned int v = unsigned(i);
+ do
+ {
+ unsigned int r = (shared == ~0u) ? v : shared;
+
+ const Vector3& p = vertex_positions[i]; // same for all wedges
+ const Quadric& A = attribute_quadrics[r];
+ const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
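+ // the gradient accumulates the weighted linear interpolant of the attribute over position; dividing by the quadric weight recovers the attribute value at p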
+ float iw = A.w == 0 ? 0.f : 1.f / A.w;
+ float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+ vertex_attributes[v * attribute_count + k] = av;
+ v = wedge[v];
+ } while (v != i);
+ }
+ }
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap, const unsigned int* remap)
{
size_t write = 0;
@@ -1211,7 +1819,14 @@ static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const
assert(collapse_remap[v1] == v1);
assert(collapse_remap[v2] == v2);
- if (v0 != v1 && v0 != v2 && v1 != v2)
+ // collapse zero area triangles even if they are not topologically degenerate
+ // this is required to cleanup manifold->seam collapses when a vertex is collapsed onto a seam pair
+ // as well as complex collapses and some other cases where cross wedge collapses are performed
+ unsigned int r0 = remap[v0];
+ unsigned int r1 = remap[v1];
+ unsigned int r2 = remap[v2];
+
+ if (r0 != r1 && r0 != r2 && r1 != r2)
{
indices[write + 0] = v0;
indices[write + 1] = v1;
@@ -1227,17 +1842,183 @@ static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsign
{
for (size_t i = 0; i < vertex_count; ++i)
{
+ // note: this is a no-op for vertices that were remapped
+ // ideally we would clear the loop entries for those for consistency, even though they aren't going to be used
+ // however, the remapping process needs loop information for remapped vertices, so this would require a separate pass
if (loop[i] != ~0u)
{
unsigned int l = loop[i];
unsigned int r = collapse_remap[l];
// i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
- loop[i] = (i == r) ? loop[l] : r;
+ if (i == r)
+ loop[i] = (loop[l] != ~0u) ? collapse_remap[loop[l]] : ~0u;
+ else
+ loop[i] = r;
}
}
}
+static unsigned int follow(unsigned int* parents, unsigned int index)
+{
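+ // path halving: point each visited node at its grandparent while walking to the root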
+ while (index != parents[index])
+ {
+ unsigned int parent = parents[index];
+ parents[index] = parents[parent];
+ index = parent;
+ }
+
+ return index;
+}
+
+static size_t buildComponents(unsigned int* components, size_t vertex_count, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ components[i] = unsigned(i);
+
+ // compute a unique (but not sequential!) index for each component via union-find
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ static const int next[4] = {1, 2, 0, 1};
+
+ for (int e = 0; e < 3; ++e)
+ {
+ unsigned int i0 = indices[i + e];
+ unsigned int i1 = indices[i + next[e]];
+
+ unsigned int r0 = remap[i0];
+ unsigned int r1 = remap[i1];
+
+ r0 = follow(components, r0);
+ r1 = follow(components, r1);
+
+ // merge components with larger indices into components with smaller indices
+ // this guarantees that the root of the component is always the one with the smallest index
+ if (r0 != r1)
+ components[r0 < r1 ? r1 : r0] = r0 < r1 ? r0 : r1;
+ }
+ }
+
+ // make sure each element points to the component root *before* we renumber the components
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (remap[i] == i)
+ components[i] = follow(components, unsigned(i));
+
+ unsigned int next_component = 0;
+
+ // renumber components using sequential indices
+ // a sequential pass is sufficient because component root always has the smallest index
+ // note: it is unsafe to use follow() in this pass because we're replacing component links with sequential indices in place
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] == i)
+ {
+ unsigned int root = components[i];
+ assert(root <= i); // make sure we already computed the component for non-roots
+ components[i] = (root == i) ? next_component++ : components[root];
+ }
+ else
+ {
+ assert(remap[i] < i); // make sure we already computed the component
+ components[i] = components[remap[i]];
+ }
+ }
+
+ return next_component;
+}
+
+static void measureComponents(float* component_errors, size_t component_count, const unsigned int* components, const Vector3* vertex_positions, size_t vertex_count)
+{
+ memset(component_errors, 0, component_count * 4 * sizeof(float));
+
+ // compute approximate sphere center for each component as an average
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int c = components[i];
+ assert(components[i] < component_count);
+
+ Vector3 v = vertex_positions[i]; // copy avoids aliasing issues
+
+ component_errors[c * 4 + 0] += v.x;
+ component_errors[c * 4 + 1] += v.y;
+ component_errors[c * 4 + 2] += v.z;
+ component_errors[c * 4 + 3] += 1; // weight
+ }
+
+ // complete the center computation, and reinitialize [3] as a radius
+ for (size_t i = 0; i < component_count; ++i)
+ {
+ float w = component_errors[i * 4 + 3];
+ float iw = w == 0.f ? 0.f : 1.f / w;
+
+ component_errors[i * 4 + 0] *= iw;
+ component_errors[i * 4 + 1] *= iw;
+ component_errors[i * 4 + 2] *= iw;
+ component_errors[i * 4 + 3] = 0; // radius
+ }
+
+ // compute squared radius for each component
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int c = components[i];
+
+ float dx = vertex_positions[i].x - component_errors[c * 4 + 0];
+ float dy = vertex_positions[i].y - component_errors[c * 4 + 1];
+ float dz = vertex_positions[i].z - component_errors[c * 4 + 2];
+ float r = dx * dx + dy * dy + dz * dz;
+
+ component_errors[c * 4 + 3] = component_errors[c * 4 + 3] < r ? r : component_errors[c * 4 + 3];
+ }
+
+ // we've used the output buffer as scratch space, so we need to move the results to proper indices
+ for (size_t i = 0; i < component_count; ++i)
+ {
+#if TRACE >= 2
+ printf("component %d: center %f %f %f, error %e\n", int(i),
+ component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3]));
+#endif
+ // note: we keep the squared error to make it match quadric error metric
+ component_errors[i] = component_errors[i * 4 + 3];
+ }
+}
+
+static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror)
+{
+ (void)component_count;
+
+ size_t write = 0;
+ float min_error = FLT_MAX;
+
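+ // note: component_errors[] and error_cutoff are squared distances, consistent with the quadric error metric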
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int v0 = indices[i + 0], v1 = indices[i + 1], v2 = indices[i + 2];
+ unsigned int c = components[v0];
+ assert(c == components[v1] && c == components[v2]);
+
+ if (component_errors[c] > error_cutoff)
+ {
+ min_error = min_error > component_errors[c] ? component_errors[c] : min_error;
+
+ indices[write + 0] = v0;
+ indices[write + 1] = v1;
+ indices[write + 2] = v2;
+ write += 3;
+ }
+ }
+
+#if TRACE
+ size_t pruned_components = 0;
+ for (size_t i = 0; i < component_count; ++i)
+ pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff);
+
+ printf("pruned %d triangles in %d components (goal %e); next %e\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff), min_error < FLT_MAX ? sqrtf(min_error) : min_error * 2);
+#endif
+
+ // update next error with the smallest error of the remaining components
+ nexterror = min_error;
+ return write;
+}
+
struct CellHasher
{
const unsigned int* vertex_ids;
@@ -1299,7 +2080,7 @@ struct TriangleHasher
}
};
-static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, const unsigned char* vertex_lock, size_t vertex_count, int grid_size)
{
assert(grid_size >= 1 && grid_size <= 1024);
float cell_scale = float(grid_size - 1);
@@ -1312,7 +2093,10 @@ static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_pos
int yi = int(v.y * cell_scale + 0.5f);
int zi = int(v.z * cell_scale + 0.5f);
- vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
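+ // locked vertices get a unique id so they are never merged into a shared cell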
+ if (vertex_lock && (vertex_lock[i] & meshopt_SimplifyVertex_Lock))
+ vertex_ids[i] = (1 << 30) | unsigned(i);
+ else
+ vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
}
}
@@ -1541,17 +2325,17 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
// three point interpolation from "revenge of interpolation search" paper
float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
- return x1 + num / den;
+ return x1 + (den == 0.f ? 0.f : num / den);
}
} // namespace meshopt
-#ifndef NDEBUG
-// Note: this is only exposed for debug visualization purposes; do *not* use these in debug builds
-MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = NULL;
-#endif
+// Note: this is only exposed for development purposes; do *not* use
+enum
+{
+ meshopt_SimplifyInternalSolve = 1 << 29,
+ meshopt_SimplifyInternalDebug = 1 << 30
+};
size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
@@ -1561,10 +2345,13 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
- assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute)) == 0);
+ assert(target_error >= 0);
+ assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyRegularize | meshopt_SimplifyPermissive | meshopt_SimplifyInternalSolve | meshopt_SimplifyInternalDebug)) == 0);
assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
assert(vertex_attributes_stride % sizeof(float) == 0);
assert(attribute_count <= kMaxAttributes);
+ for (size_t i = 0; i < attribute_count; ++i)
+ assert(attribute_weights[i] >= 0);
meshopt_Allocator allocator;
@@ -1584,6 +2371,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL);
// build position remap that maps each vertex to the one with identical position
+ // wedge table stores next vertex with identical position for each vertex
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator);
@@ -1610,14 +2398,23 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
#endif
Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
- float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap);
+ float vertex_offset[3] = {};
+ float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, vertex_offset);
float* vertex_attributes = NULL;
+ unsigned int attribute_remap[kMaxAttributes];
if (attribute_count)
{
+ // remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics
+ size_t attributes_used = 0;
+ for (size_t i = 0; i < attribute_count; ++i)
+ if (attribute_weights[i] > 0)
+ attribute_remap[attributes_used++] = unsigned(i);
+
+ attribute_count = attributes_used;
vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
- rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, sparse_remap);
+ rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, attribute_remap, sparse_remap);
}
Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
@@ -1625,6 +2422,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
Quadric* attribute_quadrics = NULL;
QuadricGrad* attribute_gradients = NULL;
+ QuadricGrad* volume_gradients = NULL;
if (attribute_count)
{
@@ -1633,13 +2431,42 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+
+ if (options & meshopt_SimplifyInternalSolve)
+ {
+ volume_gradients = allocator.allocate<QuadricGrad>(vertex_count);
+ memset(volume_gradients, 0, vertex_count * sizeof(QuadricGrad));
+ }
}
- fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap);
+ fillFaceQuadrics(vertex_quadrics, volume_gradients, result, index_count, vertex_positions, remap);
+ fillVertexQuadrics(vertex_quadrics, vertex_positions, vertex_count, remap, options);
fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
if (attribute_count)
- fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
+ fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count);
+
+ unsigned int* components = NULL;
+ float* component_errors = NULL;
+ size_t component_count = 0;
+ float component_nexterror = 0;
+
+ if (options & meshopt_SimplifyPrune)
+ {
+ components = allocator.allocate<unsigned int>(vertex_count);
+ component_count = buildComponents(components, vertex_count, result, index_count, remap);
+
+ component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+ measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+ component_nexterror = FLT_MAX;
+ for (size_t i = 0; i < component_count; ++i)
+ component_nexterror = component_nexterror > component_errors[i] ? component_errors[i] : component_nexterror;
+
+#if TRACE
+ printf("components: %d (min error %e)\n", int(component_count), sqrtf(component_nexterror));
+#endif
+ }
#if TRACE
size_t pass_count = 0;
@@ -1654,6 +2481,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
size_t result_count = index_count;
float result_error = 0;
+ float vertex_error = 0;
// target_error input is linear; we need to adjust it to match quadricError units
float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? vertex_scale : 1.f;
@@ -1664,14 +2492,18 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
- size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop);
+ size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop, loopback);
assert(edge_collapse_count <= collapse_capacity);
// no edges can be collapsed any more due to topology restrictions
if (edge_collapse_count == 0)
break;
- rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+#if TRACE
+ printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
+#endif
+
+ rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);
sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
@@ -1682,39 +2514,101 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
memset(collapse_locked, 0, vertex_count);
-#if TRACE
- printf("pass %d: ", int(pass_count++));
-#endif
-
- size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
+ size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
if (collapses == 0)
break;
+ updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error);
+
+ // updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent
+ vertex_error = attribute_count == 0 ? result_error : vertex_error;
+
+ // note: we update loops following edge collapses, but after this we might still have stale loop data
+ // this can happen when a triangle with a loop edge gets collapsed along a non-loop edge
+ // that works since a loop that points to a vertex that is no longer connected does not affect collapse logic
remapEdgeLoops(loop, vertex_count, collapse_remap);
remapEdgeLoops(loopback, vertex_count, collapse_remap);
- size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
- assert(new_count < result_count);
+ result_count = remapIndexBuffer(result, result_count, collapse_remap, remap);
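+ // prune small components whose error is already within the positional error incurred by collapses so far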
+ if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error)
+ result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror);
+ }
+
+ // at this point, component_nexterror might be stale: the component it references may have been removed through a series of edge collapses
+ bool component_nextstale = true;
+
+ // we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit
+ while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit)
+ {
+#if TRACE
+ printf("pass %d: cleanup; ", int(pass_count++));
+#endif
+
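+ // raise the pruning cutoff gradually (1.5x the smallest remaining component error) without exceeding the error limit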
+ float component_cutoff = component_nexterror * 1.5f < error_limit ? component_nexterror * 1.5f : error_limit;
+
+ // track maximum error in eligible components as we are increasing resulting error
+ float component_maxerror = 0;
+ for (size_t i = 0; i < component_count; ++i)
+ if (component_errors[i] > component_maxerror && component_errors[i] <= component_cutoff)
+ component_maxerror = component_errors[i];
+
+ size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror);
+ if (new_count == result_count && !component_nextstale)
+ break;
+
+ component_nextstale = false; // pruneComponents guarantees next error is up to date
result_count = new_count;
+ result_error = result_error < component_maxerror ? component_maxerror : result_error;
+ vertex_error = vertex_error < component_maxerror ? component_maxerror : vertex_error;
}
#if TRACE
- printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count));
+ printf("result: %d triangles, error: %e (pos %.3e); total %d passes\n", int(result_count / 3), sqrtf(result_error), sqrtf(vertex_error), int(pass_count));
#endif
-#ifndef NDEBUG
- if (meshopt_simplifyDebugKind)
- memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+ // if solve is requested, update input buffers destructively from internal data
+ if (options & meshopt_SimplifyInternalSolve)
+ {
+ unsigned char* vertex_update = collapse_locked; // reuse as scratch space
+ memset(vertex_update, 0, vertex_count);
- if (meshopt_simplifyDebugLoop)
- memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+ // limit quadric solve to vertices that are still used in the result
+ for (size_t i = 0; i < result_count; ++i)
+ {
+ unsigned int v = result[i];
- if (meshopt_simplifyDebugLoopBack)
- memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
-#endif
+ // mark the vertex for finalizeVertices and root vertex for solve*
+ vertex_update[remap[v]] = vertex_update[v] = 1;
+ }
+
+ // edge adjacency may be stale as we haven't updated it after last series of edge collapses
+ updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+ solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+
+ if (attribute_count)
+ solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+ finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
+ }
+
+ // if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
+ if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap)
+ {
+ assert(Kind_Count <= 8 && vertex_count < (1 << 28)); // 3 bit kind, 1 bit loop
+
+ for (size_t i = 0; i < result_count; i += 3)
+ {
+ unsigned int a = result[i + 0], b = result[i + 1], c = result[i + 2];
+
+ result[i + 0] |= (vertex_kind[a] << 28) | (unsigned(loop[a] == b || loopback[b] == a) << 31);
+ result[i + 1] |= (vertex_kind[b] << 28) | (unsigned(loop[b] == c || loopback[c] == b) << 31);
+ result[i + 2] |= (vertex_kind[c] << 28) | (unsigned(loop[c] == a || loopback[a] == c) << 31);
+ }
+ }
// convert resulting indices back into the dense space of the larger mesh
if (sparse_remap)
@@ -1730,15 +2624,24 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
+ assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead
+
return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, target_index_count, target_error, options, out_result_error);
}
size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
+ assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead
+
return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error);
}
-size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+ return meshopt_simplifyEdge(indices, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options | meshopt_SimplifyInternalSolve, out_result_error);
+}
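+
+// usage sketch (hypothetical buffers): simplifies in place and rewrites positions/attributes of the surviving vertices
+//   float weights[2] = {1.f, 1.f};
+//   float error = 0.f;
+//   size_t new_index_count = meshopt_simplifyWithUpdate(indices, index_count, positions, vertex_count, sizeof(float) * 3,
+//       uvs, sizeof(float) * 2, weights, 2, /* vertex_lock= */ NULL, target_index_count, /* target_error= */ 1e-2f, /* options= */ 0, &error);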
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* out_result_error)
{
using namespace meshopt;
@@ -1766,15 +2669,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
const int kInterpolationPasses = 5;
// invariant: # of triangles in min_grid <= target_count
- int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error));
+ int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : (target_error < 1.f ? target_error : 1.f)));
int max_grid = 1025;
size_t min_triangles = 0;
size_t max_triangles = index_count / 3;
// when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid
- if (min_grid > 1)
+ if (min_grid > 1 || vertex_lock)
{
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
min_triangles = countTriangles(vertex_ids, indices, index_count);
}
@@ -1790,7 +2693,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
int grid_size = next_grid_size;
grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, grid_size);
size_t triangles = countTriangles(vertex_ids, indices, index_count);
#if TRACE
@@ -1800,7 +2703,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
(triangles <= target_index_count / 3) ? "under" : "over");
#endif
- float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+ float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
if (triangles <= target_index_count / 3)
{
@@ -1832,7 +2735,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
// build a quadric for each target cell
@@ -1853,15 +2756,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
for (size_t i = 0; i < cell_count; ++i)
result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error;
- // collapse triangles!
- // note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+ // vertex collapses often result in duplicate triangles; we need a table to filter them out
size_t tritable_size = hashBuckets2(min_triangles);
unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+ // note: this is the first and last write to destination, which allows aliasing destination with indices
size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
#if TRACE
- printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
+ printf("result: grid size %d, %d cells, %d triangles (%d unfiltered), error %e\n", min_grid, int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
#endif
if (out_result_error)
@@ -1870,6 +2773,40 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
return write;
}
+size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_error >= 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* result = destination;
+ if (result != indices)
+ memcpy(result, indices, index_count * sizeof(unsigned int));
+
+ // build position remap that maps each vertex to the one with identical position
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ buildPositionRemap(remap, NULL, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, allocator);
+
+ Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+ rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, NULL);
+
+ unsigned int* components = allocator.allocate<unsigned int>(vertex_count);
+ size_t component_count = buildComponents(components, vertex_count, indices, index_count, remap);
+
+ float* component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+ measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+ float component_nexterror = 0;
+ size_t result_count = pruneComponents(result, index_count, components, component_errors, component_count, target_error * target_error, component_nexterror);
+
+ return result_count;
+}
+
size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count)
{
using namespace meshopt;
@@ -1922,7 +2859,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
int grid_size = next_grid_size;
grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, grid_size);
size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
#if TRACE
@@ -1959,7 +2896,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, min_grid);
size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
// accumulate points into a reservoir for each target cell
@@ -1972,7 +2909,10 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
float* cell_errors = allocator.allocate<float>(cell_count);
- fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight * color_weight, vertex_count);
+ // we scale the color weight to bring it to the same scale as position so that error addition makes sense
+ float color_weight_scaled = color_weight * (min_grid == 1 ? 1.f : 1.f / (min_grid - 1));
+
+ fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight_scaled * color_weight_scaled, vertex_count);
// copy results to the output
assert(cell_count <= target_vertex_count);
diff --git a/Source/ThirdParty/meshoptimizer/spatialorder.cpp b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
index 7b1a06945..8a785fcd5 100644
--- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp
+++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
@@ -10,18 +10,19 @@
namespace meshopt
{
-// "Insert" two 0 bits after each of the 10 low bits of x
-inline unsigned int part1By2(unsigned int x)
+// "Insert" two 0 bits after each of the 20 low bits of x
+inline unsigned long long part1By2(unsigned long long x)
{
- x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
- x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
- x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
- x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
- x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+ x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
+ x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
+ x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
+ x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
+ x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
+ x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
}
-static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
@@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
- float scale = extent == 0 ? 0.f : 1.f / extent;
+ // rescale each axis to 16 bits to get 48-bit Morton codes
+ float scale = extent == 0 ? 0.f : 65535.f / extent;
// generate Morton order based on the position inside a unit cube
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions_data + i * vertex_stride_float;
- int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
- int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
- int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+ int x = int((v[0] - minv[0]) * scale + 0.5f);
+ int y = int((v[1] - minv[1]) * scale + 0.5f);
+ int z = int((v[2] - minv[2]) * scale + 0.5f);
- result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
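+ // morton interleaves coordinate bits for spatial sorting; the non-morton path packs x/y/z into separate 20-bit fields so each axis can be sorted independently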
+ if (morton)
+ result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+ else
+ result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
}
}
-static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
{
+ unsigned int hist[1024];
memset(hist, 0, sizeof(hist));
- // compute 3 10-bit histograms in parallel
+ // compute histogram (assume keys are 10-bit)
for (size_t i = 0; i < count; ++i)
- {
- unsigned int id = data[i];
+ hist[keys[i]]++;
- hist[(id >> 0) & 1023][0]++;
- hist[(id >> 10) & 1023][1]++;
- hist[(id >> 20) & 1023][2]++;
- }
-
- unsigned int sumx = 0, sumy = 0, sumz = 0;
+ unsigned int sum = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 1024; ++i)
{
- unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
-
- hist[i][0] = sumx;
- hist[i][1] = sumy;
- hist[i][2] = sumz;
-
- sumx += hx;
- sumy += hy;
- sumz += hz;
+ unsigned int h = hist[i];
+ hist[i] = sum;
+ sum += h;
}
- assert(sumx == count && sumy == count && sumz == count);
+ assert(sum == count);
+
+ // reorder values
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = keys[source[i]];
+
+ destination[hist[id]++] = source[i];
+ }
}
-static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
{
- int bitoff = pass * 10;
+ memset(hist, 0, sizeof(hist));
+
+ // compute 2 8-bit histograms in parallel
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned long long id = data[i];
+
+ hist[(id >> 0) & 255][0]++;
+ hist[(id >> 8) & 255][1]++;
+ }
+
+ unsigned int sum0 = 0, sum1 = 0;
+
+ // replace histogram data with prefix histogram sums in-place
+ for (int i = 0; i < 256; ++i)
+ {
+ unsigned int h0 = hist[i][0], h1 = hist[i][1];
+
+ hist[i][0] = sum0;
+ hist[i][1] = sum1;
+
+ sum0 += h0;
+ sum1 += h1;
+ }
+
+ assert(sum0 == count && sum1 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
+{
+ int bitoff = pass * 8;
for (size_t i = 0; i < count; ++i)
{
- unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+ unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255;
destination[hist[id][pass]++] = source[i];
}
}
+static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+ size_t l = 0, r = split;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned char side = sides[order[i]];
+ target[side ? r : l] = order[i];
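+ // branchless cursor update: l advances when side == 0, r advances when side == 1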
+ l += 1;
+ l -= side;
+ r += side;
+ }
+
+ assert(l == split && r == count);
+}
+
+static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
+{
+ if (count <= cluster_size)
+ {
+ memcpy(destination, orderx, count * sizeof(unsigned int));
+ return;
+ }
+
+ unsigned int* axes[3] = {orderx, ordery, orderz};
+
+ int bestk = -1;
+ unsigned int bestdim = 0;
+
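+ // pick the split axis with the largest extent; each 20-bit field of the key holds the quantized coordinate along one axis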
+ for (int k = 0; k < 3; ++k)
+ {
+ const unsigned int mask = (1 << 20) - 1;
+ unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask);
+
+ if (dim >= bestdim)
+ {
+ bestk = k;
+ bestdim = dim;
+ }
+ }
+
+ assert(bestk >= 0);
+
+ // split roughly in half, with the left split always being aligned to cluster size
+ size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
+ assert(split > 0 && split < count);
+
+ // mark sides of split for partitioning
+ unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+ for (size_t i = 0; i < split; ++i)
+ sides[axes[bestk][i]] = 0;
+
+ for (size_t i = split; i < count; ++i)
+ sides[axes[bestk][i]] = 1;
+
+ // partition all axes into two sides, maintaining order
+ unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ if (k == bestk)
+ continue;
+
+ unsigned int* axis = axes[k];
+ memcpy(temp, axis, sizeof(unsigned int) * count);
+ partitionPoints(axis, temp, sides, split, count);
+ }
+
+ // recursion depth is logarithmic and bounded as we always split in approximately half
+ splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
+ splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
+}
+
} // namespace meshopt
void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
meshopt_Allocator allocator;
- unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
- computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+ unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+ computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
- unsigned int hist[1024][3];
- computeHistogram(hist, keys, vertex_count);
-
- unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+ unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
+ unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
destination[i] = unsigned(i);
- // 3-pass radix sort computes the resulting order into scratch
- radixPass(scratch, destination, keys, vertex_count, hist, 0);
- radixPass(destination, scratch, keys, vertex_count, hist, 1);
- radixPass(scratch, destination, keys, vertex_count, hist, 2);
+ unsigned int* order[] = {scratch, destination};
+
+ // 5-pass radix sort computes the resulting order into scratch
+ for (int k = 0; k < 5; ++k)
+ {
+ // copy 10-bit key segments into keyk to reduce cache pressure during radix pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
+
+ radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
+ }
// since our remap table is mapping old=>new, we need to reverse it
for (size_t i = 0; i < vertex_count; ++i)
@@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
destination[r * 3 + 2] = c;
}
}
+
+void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(cluster_size > 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+ computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false);
+
+ unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3);
+ unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
+ unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ // copy 16-bit key segments into keyk to reduce cache pressure during radix pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ keyk[i] = (unsigned short)(keys[i] >> (k * 20));
+
+ unsigned int hist[256][2];
+ computeHistogram(hist, keyk, vertex_count);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ order[k * vertex_count + i] = unsigned(i);
+
+ radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0);
+ radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1);
+ }
+
+ splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
+}
diff --git a/Source/ThirdParty/meshoptimizer/stripifier.cpp b/Source/ThirdParty/meshoptimizer/stripifier.cpp
index d57fb512b..4043195ae 100644
--- a/Source/ThirdParty/meshoptimizer/stripifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/stripifier.cpp
@@ -10,14 +10,14 @@
namespace meshopt
{
-static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
{
unsigned int index = 0;
unsigned int iv = ~0u;
for (size_t i = 0; i < buffer_size; ++i)
{
- unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+ unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
if (v < iv)
@@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
size_t strip_size = 0;
// compute vertex valence; this is used to prioritize starting triangle for strips
- unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
- memset(valence, 0, vertex_count * sizeof(unsigned int));
+ // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+ unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+ memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
@@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
{
// if we didn't find anything, we need to find the next new triangle
// we use a heuristic to maximize the strip length
- unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+ unsigned int i = findStripFirst(buffer, buffer_size, valence);
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
// ordered removal from the buffer
diff --git a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
index 94f7a1adc..7085cce32 100644
--- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
@@ -60,6 +60,15 @@
#define SIMD_LATENCYOPT
#endif
+// In switch dispatch, marking the default case as unreachable lets the compiler remove redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
#endif // !MESHOPTIMIZER_NO_SIMD
#ifdef SIMD_SSE
@@ -90,6 +99,14 @@
#include
#endif
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
@@ -105,50 +122,76 @@ namespace meshopt
const unsigned char kVertexHeader = 0xa0;
-static int gEncodeVertexVersion = 0;
+static int gEncodeVertexVersion = 1;
+const int kDecodeVertexVersion = 1;
const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;
static size_t getVertexBlockSize(size_t vertex_size)
{
- // make sure the entire block fits into the scratch buffer
- size_t result = kVertexBlockSizeBytes / vertex_size;
-
- // align to byte group size; we encode each byte as a byte group
- // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
- result &= ~(kByteGroupSize - 1);
+ // make sure the entire block fits into the scratch buffer and is aligned to byte group size
+ // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+ size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
{
- return ((signed char)(v) >> 7) ^ (v << 1);
+ return (v << r) | (v >> ((32 - r) & 31));
}
-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
{
- return -(v & 1) ^ (v >> 1);
+ return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
}
+template <typename T>
+inline T unzigzag(T v)
+{
+ return (0 - (v & 1)) ^ (v >> 1);
+}
+
+#if TRACE
+struct Stats
+{
+ size_t size;
+ size_t header; // bytes for header
+ size_t bitg[9]; // bytes for bit groups
+ size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+ size_t ctrl[4]; // number of control groups
+};
+
+static Stats* bytestats = NULL;
+static Stats vertexstats[256];
+#endif
+
static bool encodeBytesGroupZero(const unsigned char* buffer)
{
- for (size_t i = 0; i < kByteGroupSize; ++i)
- if (buffer[i])
- return false;
+ assert(kByteGroupSize == sizeof(unsigned long long) * 2);
- return true;
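+ // treat the 16-byte group as two 64-bit words so the all-zero check is two loads and one compare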
+ unsigned long long v[2];
+ memcpy(v, buffer, sizeof(v));
+
+ return (v[0] | v[1]) == 0;
}
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
- assert(bits >= 1 && bits <= 8);
+ assert(bits >= 0 && bits <= 8);
- if (bits == 1)
+ if (bits == 0)
return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
if (bits == 8)
@@ -166,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
- assert(bits >= 1 && bits <= 8);
+ assert(bits >= 0 && bits <= 8);
+ assert(kByteGroupSize % 8 == 0);
- if (bits == 1)
+ if (bits == 0)
return data;
if (bits == 8)
@@ -196,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char*
byte |= enc;
}
+ // encode 1-bit groups in reverse bit order
+ // this makes them faster to decode alongside other groups
+ if (bits == 1)
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+
*data++ = byte;
}
for (size_t i = 0; i < kByteGroupSize; ++i)
{
- if (buffer[i] >= sentinel)
- {
- *data++ = buffer[i];
- }
+ unsigned char v = buffer[i];
+
+ // branchless append of out-of-range values
+ *data = v;
+ data += v >= sentinel;
}
return data;
}
-static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
{
assert(buffer_size % kByteGroupSize == 0);
@@ -226,69 +276,301 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
memset(header, 0, header_size);
+ int last_bits = -1;
+
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
{
if (size_t(data_end - data) < kByteGroupDecodeLimit)
return NULL;
- int best_bits = 8;
- size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+ int best_bitk = 3;
+ size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
- for (int bits = 1; bits < 8; bits *= 2)
+ for (int bitk = 0; bitk < 3; ++bitk)
{
- size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+ size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
- if (size < best_size)
+ // favor consistent bit selection across groups, but never replace literals
+ if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
{
- best_bits = bits;
+ best_bitk = bitk;
best_size = size;
}
}
- int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
- assert((1 << bitslog2) == best_bits);
-
size_t header_offset = i / kByteGroupSize;
+ header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
- header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
-
+ int best_bits = bits[best_bitk];
unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
assert(data + best_size == next);
data = next;
+ last_bits = best_bits;
+
+#if TRACE
+ bytestats->bitg[best_bits] += best_size;
+#endif
}
+#if TRACE
+ bytestats->header += header_size;
+#endif
+
return data;
}
-static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
+{
+ size_t k0 = k & ~(sizeof(T) - 1);
+ int ks = (k & (sizeof(T) - 1)) * 8;
+
+ T p = last_vertex[k0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ p |= T(last_vertex[k0 + j]) << (j * 8);
+
+ const unsigned char* vertex = vertex_data + k0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ T v = vertex[0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ v |= vertex[j] << (j * 8);
+
+ T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
+
+ buffer[i] = (unsigned char)(d >> ks);
+ p = v;
+ vertex += vertex_size;
+ }
+}
+
+static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
+{
+ switch (channel & 3)
+ {
+ case 0:
+ return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+ case 1:
+ return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+ case 2:
+ return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
+ default:
+ assert(!"Unsupported channel encoding"); // unreachable
+ }
+}
+
+static int estimateBits(unsigned char v)
+{
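+ // smallest group bit width from {0, 2, 4, 8} that can represent v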
+ return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
+}
+
+static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
+{
+ size_t sizes[8] = {};
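+ // estimated encoding cost for each of the 8 candidate bit rotations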
+
+ const unsigned char* vertex = vertex_data + k;
+ unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+
+ for (size_t i = 0; i < vertex_count; i += group_size)
+ {
+ unsigned int bitg = 0;
+
+ // calculate bit consistency mask for the group
+ for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
+ {
+ unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+ unsigned int d = v ^ last;
+
+ bitg |= d;
+ last = v;
+ vertex += vertex_size;
+ }
+
+#if TRACE
+ for (int j = 0; j < 32; ++j)
+ vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
+#endif
+
+ for (int j = 0; j < 8; ++j)
+ {
+ unsigned int bitr = rotate(bitg, j);
+
+ sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
+ sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
+ }
+ }
+
+ int best_rot = 0;
+ for (int rot = 1; rot < 8; ++rot)
+ best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
+
+ return best_rot;
+}
+
+static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
+{
+ unsigned char block[kVertexBlockMaxSize];
+ assert(vertex_block_size <= kVertexBlockMaxSize);
+
+ unsigned char last_vertex[256] = {};
+
+ size_t sizes[3] = {};
+ assert(max_channel <= 3);
+
+ for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
+ {
+ size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
+ size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+ memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
+
+ // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+ if (block_size < block_size_aligned)
+ memset(block + block_size, 0, block_size_aligned - block_size);
+
+ for (int channel = 0; channel < max_channel; ++channel)
+ for (size_t j = 0; j < 4; ++j)
+ {
+ encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
+
+ for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
+ {
+ // to maximize encoding performance we only evaluate 1/2/4/8 bit groups
+ size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
+ size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
+ size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
+ size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
+
+ size_t best_size = size1 < size2 ? size1 : size2;
+ best_size = best_size < size4 ? best_size : size4;
+ best_size = best_size < size8 ? best_size : size8;
+
+ sizes[channel] += best_size;
+ }
+ }
+ }
+
+ int best_channel = 0;
+ for (int channel = 1; channel < max_channel; ++channel)
+ best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
+
+ return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
+}
+
+static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
+{
+ for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+ if (!encodeBytesGroupZero(buffer + i))
+ return false;
+
+ return true;
+}
+
+static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
+{
+ if (estimateControlZero(buffer, vertex_count_aligned))
+ return 2; // zero encoding
+
+ if (level == 0)
+ return 1; // 1248 encoding in level 0 for encoding speed
+
+ // round number of groups to 4 to get number of header bytes
+ size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
+
+ size_t est_bytes0 = header_size, est_bytes1 = header_size;
+
+ for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+ {
+ // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
+ size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
+ size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
+ size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
+ size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
+ size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
+
+ // both control modes have access to 1/2/4 bit encoding
+ size_t size12 = size1 < size2 ? size1 : size2;
+ size_t size124 = size12 < size4 ? size12 : size4;
+
+ // each control mode has access to 0/8 bit encoding respectively
+ est_bytes0 += size124 < size0 ? size124 : size0;
+ est_bytes1 += size124 < size8 ? size124 : size8;
+ }
+
+ // pick shortest control entry but prefer literal encoding
+ if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
+ return est_bytes0 < est_bytes1 ? 0 : 1;
+ else
+ return 3; // literal encoding
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
{
assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+ assert(vertex_size % 4 == 0);
unsigned char buffer[kVertexBlockMaxSize];
assert(sizeof(buffer) % kByteGroupSize == 0);
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
memset(buffer, 0, sizeof(buffer));
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ unsigned char* control = data;
+ data += control_size;
+
+ memset(control, 0, control_size);
+
for (size_t k = 0; k < vertex_size; ++k)
{
- size_t vertex_offset = k;
+ encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
- unsigned char p = last_vertex[k];
+#if TRACE
+ const unsigned char* olddata = data;
+ bytestats = &vertexstats[k];
+#endif
- for (size_t i = 0; i < vertex_count; ++i)
+ int ctrl = 0;
+
+ if (version != 0)
{
- buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+ ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
- p = vertex_data[vertex_offset];
+ assert(unsigned(ctrl) < 4);
+ control[k / 4] |= ctrl << ((k % 4) * 2);
- vertex_offset += vertex_size;
+#if TRACE
+ vertexstats[k].ctrl[ctrl]++;
+#endif
}
- data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
- if (!data)
- return NULL;
+ if (ctrl == 3)
+ {
+ // literal encoding
+ if (size_t(data_end - data) < vertex_count)
+ return NULL;
+
+ memcpy(data, buffer, vertex_count);
+ data += vertex_count;
+ }
+ else if (ctrl != 2) // non-zero encoding
+ {
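+ // kBitsV1 + ctrl selects widths {0, 1, 2, 4} for ctrl 0 and {1, 2, 4, 8} for ctrl 1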
+ data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+ if (!data)
+ return NULL;
+ }
+
+#if TRACE
+ bytestats = NULL;
+ vertexstats[k].size += data - olddata;
+#endif
}
memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -297,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
}
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
-static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
@@ -305,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
unsigned char byte, enc, encv;
const unsigned char* data_var;
- switch (bitslog2)
+ switch (bits)
{
case 0:
memset(buffer, 0, kByteGroupSize);
return data;
case 1:
+ data_var = data + 2;
+
+ // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
+ READ();
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+ NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+ READ();
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+ NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+
+ return data_var;
+ case 2:
data_var = data + 4;
// 4 groups with 4 2-bit values in each byte
@@ -320,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
return data_var;
- case 2:
+ case 4:
data_var = data + 8;
// 8 groups with 2 4-bit values in each byte
@@ -334,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
READ(), NEXT(4), NEXT(4);
return data_var;
- case 3:
+ case 8:
memcpy(buffer, data, kByteGroupSize);
return data + kByteGroupSize;
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ assert(!"Unexpected bit length"); // unreachable
return data;
}
@@ -346,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
#undef NEXT
}
-static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
{
assert(buffer_size % kByteGroupSize == 0);
- const unsigned char* header = data;
-
// round number of groups to 4 to get number of header bytes
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
if (size_t(data_end - data) < header_size)
return NULL;
+ const unsigned char* header = data;
data += header_size;
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
@@ -366,43 +658,109 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
return NULL;
size_t header_offset = i / kByteGroupSize;
+ int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
- int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
- data = decodeBytesGroup(data, buffer + i, bitslog2);
+ data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
}
return data;
}
-static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
{
- assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
-
- unsigned char buffer[kVertexBlockMaxSize];
- unsigned char transposed[kVertexBlockSizeBytes];
-
- size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
-
- for (size_t k = 0; k < vertex_size; ++k)
+ for (size_t k = 0; k < 4; k += sizeof(T))
{
- data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
- if (!data)
- return NULL;
-
size_t vertex_offset = k;
- unsigned char p = last_vertex[k];
+ T p = last_vertex[0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ p |= last_vertex[j] << (8 * j);
for (size_t i = 0; i < vertex_count; ++i)
{
- unsigned char v = unzigzag8(buffer[i]) + p;
+ T v = buffer[i];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ v |= buffer[i + vertex_count * j] << (8 * j);
+
+ v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;
+
+ for (size_t j = 0; j < sizeof(T); ++j)
+ transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));
- transposed[vertex_offset] = v;
p = v;
vertex_offset += vertex_size;
}
+
+ buffer += vertex_count * sizeof(T);
+ last_vertex += sizeof(T);
+ }
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
+{
+ assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+ unsigned char buffer[kVertexBlockMaxSize * 4];
+ unsigned char transposed[kVertexBlockSizeBytes];
+
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+ assert(vertex_count <= vertex_count_aligned);
+
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ const unsigned char* control = data;
+ data += control_size;
+
+ for (size_t k = 0; k < vertex_size; k += 4)
+ {
+ unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
+ for (size_t j = 0; j < 4; ++j)
+ {
+ int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+ if (ctrl == 3)
+ {
+ // literal encoding
+ if (size_t(data_end - data) < vertex_count)
+ return NULL;
+
+ memcpy(buffer + j * vertex_count, data, vertex_count);
+ data += vertex_count;
+ }
+ else if (ctrl == 2)
+ {
+ // zero encoding
+ memset(buffer + j * vertex_count, 0, vertex_count);
+ }
+ else
+ {
+ data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+ if (!data)
+ return NULL;
+ }
+ }
+
+ int channel = version == 0 ? 0 : channels[k / 4];
+
+ switch (channel & 3)
+ {
+ case 0:
+ decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+ break;
+ case 1:
+ decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+ break;
+ case 2:
+ decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+ break;
+ default:
+ return NULL; // invalid channel type
+ }
}
memcpy(vertex_data, transposed, vertex_count * vertex_size);
@@ -447,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#ifdef SIMD_SSE
SIMD_TARGET
-static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
@@ -459,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
}
SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
__m128i result = _mm_setzero_si128();
@@ -473,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
#ifdef __GNUC__
typedef int __attribute__((aligned(1))) unaligned_int;
@@ -505,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
unsigned char mask1 = (unsigned char)(mask16 >> 8);
__m128i shuf = decodeShuffleMask(mask0, mask1);
-
__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -518,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
#ifdef SIMD_LATENCYOPT
unsigned long long data64;
@@ -541,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
unsigned char mask1 = (unsigned char)(mask16 >> 8);
__m128i shuf = decodeShuffleMask(mask0, mask1);
-
__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -554,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
@@ -562,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));
+
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ __m128i shuf = decodeShuffleMask(mask0, mask1);
+ __m128i result = _mm_shuffle_epi8(rest, shuf);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_AVX
-static const __m128i decodeBytesGroupConfig[] = {
- _mm_set1_epi8(3),
- _mm_set1_epi8(15),
- _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
- _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+static const __m128i kDecodeBytesGroupConfig[8][2] = {
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+ {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
+ {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+ {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
};
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
__m128i result = _mm_setzero_si128();
@@ -590,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data;
}
- case 1:
- case 2:
+ case 5: // 1-bit
+ case 1: // 2-bit
+ case 6:
+ case 2: // 4-bit
+ case 7:
{
- const unsigned char* skip = data + (bitslog2 << 2);
+ const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));
__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
- __m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
- __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+ __m128i sent = kDecodeBytesGroupConfig[hbits][0];
+ __m128i ctrl = kDecodeBytesGroupConfig[hbits][1];
__m128i selw = _mm_shuffle_epi32(selb, 0x44);
__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
@@ -613,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
@@ -622,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_NEON
-static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+SIMD_TARGET
+inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
@@ -640,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
return vcombine_u8(r0, r1);
}
-static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+SIMD_TARGET
+inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
@@ -651,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m
mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
uint8x16_t result = vdupq_n_u8(0);
@@ -665,6 +1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
#ifdef SIMD_LATENCYOPT
unsigned int data32;
@@ -702,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
#ifdef SIMD_LATENCYOPT
unsigned long long data64;
@@ -736,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
uint8x16_t result = vld1q_u8(data);
@@ -744,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ uint8x8_t rest0 = vld1_u8(data + 2);
+ uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);
+
+ uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);
+
+ vst1q_u8(buffer, result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_WASM
SIMD_TARGET
-static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
- v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
- sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-
+ v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
return wasmx_unpacklo_v64x2(sm0, sm1r);
}
SIMD_TARGET
-static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
@@ -777,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
}
SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
v128_t result = wasm_i8x16_splat(0);
@@ -791,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
v128_t sel2 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 4);
@@ -805,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
-
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
@@ -814,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
v128_t sel4 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 8);
@@ -827,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
-
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
@@ -836,6 +1239,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
v128_t result = wasm_v128_load(data);
@@ -844,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ v128_t rest = wasm_v128_load(data + 2);
+
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ v128_t shuf = decodeShuffleMask(mask0, mask1);
+ v128_t result = wasm_i8x16_swizzle(rest, shuf);
+
+ wasm_v128_store(buffer, result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
-static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
__m128i t0 = _mm_unpacklo_epi8(x0, x1);
__m128i t1 = _mm_unpackhi_epi8(x0, x1);
@@ -867,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
}
SIMD_TARGET
-static __m128i unzigzag8(__m128i v)
+inline __m128i unzigzag8(__m128i v)
{
__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
return _mm_xor_si128(xl, xr);
}
+
+SIMD_TARGET
+inline __m128i unzigzag16(__m128i v)
+{
+ __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
+ __m128i xr = _mm_srli_epi16(v, 1);
+
+ return _mm_xor_si128(xl, xr);
+}
+
+SIMD_TARGET
+inline __m128i rotate32(__m128i v, int r)
+{
+ return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
+}
#endif
#ifdef SIMD_NEON
-static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+SIMD_TARGET
+inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
uint8x16x2_t t01 = vzipq_u8(x0, x1);
uint8x16x2_t t23 = vzipq_u8(x2, x3);
@@ -891,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_
x3 = vreinterpretq_u8_u16(x23.val[1]);
}
-static uint8x16_t unzigzag8(uint8x16_t v)
+SIMD_TARGET
+inline uint8x16_t unzigzag8(uint8x16_t v)
{
uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
uint8x16_t xr = vshrq_n_u8(v, 1);
return veorq_u8(xl, xr);
}
+
+SIMD_TARGET
+inline uint8x16_t unzigzag16(uint8x16_t v)
+{
+ uint16x8_t vv = vreinterpretq_u16_u8(v);
+ uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
+ uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));
+
+ return veorq_u8(xl, xr);
+}
+
+SIMD_TARGET
+inline uint8x16_t rotate32(uint8x16_t v, int r)
+{
+ uint32x4_t v32 = vreinterpretq_u32_u8(v);
+ return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
+}
+
+template <int Channel>
+SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
+{
+ switch (Channel)
+ {
+ case 0:
+ {
+ uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
+ uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+ return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+ }
+ case 1:
+ {
+ uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
+ uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
+ return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
+ }
+ case 2:
+ {
+ uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
+ uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+ return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+ }
+ default:
+ return npi;
+ }
+}
#endif
#ifdef SIMD_WASM
SIMD_TARGET
-static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
@@ -916,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
}
SIMD_TARGET
-static v128_t unzigzag8(v128_t v)
+inline v128_t unzigzag8(v128_t v)
{
v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
v128_t xr = wasm_u8x16_shr(v, 1);
return wasm_v128_xor(xl, xr);
}
+
+SIMD_TARGET
+inline v128_t unzigzag16(v128_t v)
+{
+ v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
+ v128_t xr = wasm_u16x8_shr(v, 1);
+
+ return wasm_v128_xor(xl, xr);
+}
+
+SIMD_TARGET
+inline v128_t rotate32(v128_t v, int r)
+{
+ return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
+}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
-static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
{
assert(buffer_size % kByteGroupSize == 0);
assert(kByteGroupSize == 16);
- const unsigned char* header = data;
-
// round number of groups to 4 to get number of header bytes
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
if (size_t(data_end - data) < header_size)
return NULL;
+ const unsigned char* header = data;
data += header_size;
size_t i = 0;
- // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+ // fast-path: process 4 groups at a time, do a shared bounds check
for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
{
size_t header_offset = i / kByteGroupSize;
unsigned char header_byte = header[header_offset / 4];
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
}
// slow-path: process remaining groups
@@ -963,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
return NULL;
size_t header_offset = i / kByteGroupSize;
+ unsigned char header_byte = header[header_offset / 4];
- int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
- data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+ data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
}
return data;
}
+template <int Channel>
+SIMD_TARGET static void
+decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
+{
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<const uint32_t*>(last_vertex), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
+#endif
+
+#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
+
+ PREP();
+
+ unsigned char* savep = transposed;
+
+ for (size_t j = 0; j < vertex_count_aligned; j += 16)
+ {
+ LOAD(0);
+ LOAD(1);
+ LOAD(2);
+ LOAD(3);
+
+ transpose8(r0, r1, r2, r3);
+
+ TEMP t0, t1, t2, t3;
+ TEMP npi = pi;
+
+ UNZR(0);
+ GRP4(0);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(1);
+ GRP4(1);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(2);
+ GRP4(2);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(3);
+ GRP4(3);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+ // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+ pi = rebase<Channel>(npi, r0, r1, r2, r3);
+#else
+ (void)npi;
+#endif
+
+#undef UNZR
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+ }
+}
+
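For orientation, a scalar sketch of what the Channel == 0 path above reconstructs (illustrative only: the helper name and the plain vertex_count column stride are assumptions, and the transpose8/4-vertices-at-a-time structure of the real SIMD code is omitted):

// hypothetical scalar equivalent: byte-wise zigzag deltas accumulated per component
static void decodeDeltasByteScalar(const unsigned char* deltas, unsigned char* out,
	size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[4])
{
	unsigned char prev[4] = {last_vertex[0], last_vertex[1], last_vertex[2], last_vertex[3]};

	for (size_t v = 0; v < vertex_count; ++v)
		for (size_t c = 0; c < 4; ++c)
		{
			unsigned char z = deltas[c * vertex_count + v];               // one column per byte of the 4-byte attribute
			unsigned char d = (unsigned char)((z >> 1) ^ (0 - (z & 1)));  // unzigzag8 applied to a single byte
			prev[c] = (unsigned char)(prev[c] + d);
			out[v * vertex_size + c] = prev[c];
		}
}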
SIMD_TARGET
-static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
@@ -982,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ const unsigned char* control = data;
+ data += control_size;
+
for (size_t k = 0; k < vertex_size; k += 4)
{
+ unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
for (size_t j = 0; j < 4; ++j)
{
- data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
- if (!data)
- return NULL;
+ int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+ if (ctrl == 3)
+ {
+ // literal encoding; safe to over-copy due to tail
+ if (size_t(data_end - data) < vertex_count_aligned)
+ return NULL;
+
+ memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
+ data += vertex_count;
+ }
+ else if (ctrl == 2)
+ {
+ // zero encoding
+ memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
+ }
+ else
+ {
+ // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
+ int hshift = version == 0 ? 0 : 4 + ctrl;
+
+ data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
+ if (!data)
+ return NULL;
+ }
}
-#if defined(SIMD_SSE) || defined(SIMD_AVX)
-#define TEMP __m128i
-#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
-#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
-#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
-#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
-#endif
+ int channel = version == 0 ? 0 : channels[k / 4];
-#ifdef SIMD_NEON
-#define TEMP uint8x8_t
-#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
-#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
-#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
-#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
-#endif
-
-#ifdef SIMD_WASM
-#define TEMP v128_t
-#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
-#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
-#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
-#endif
-
- PREP();
-
- unsigned char* savep = transposed + k;
-
- for (size_t j = 0; j < vertex_count_aligned; j += 16)
+ switch (channel & 3)
{
- LOAD(0);
- LOAD(1);
- LOAD(2);
- LOAD(3);
-
- r0 = unzigzag8(r0);
- r1 = unzigzag8(r1);
- r2 = unzigzag8(r2);
- r3 = unzigzag8(r3);
-
- transpose8(r0, r1, r2, r3);
-
- TEMP t0, t1, t2, t3;
-
- GRP4(0);
- FIXD(0), FIXD(1), FIXD(2), FIXD(3);
- SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
- GRP4(1);
- FIXD(0), FIXD(1), FIXD(2), FIXD(3);
- SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
- GRP4(2);
- FIXD(0), FIXD(1), FIXD(2), FIXD(3);
- SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
- GRP4(3);
- FIXD(0), FIXD(1), FIXD(2), FIXD(3);
- SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-#undef TEMP
-#undef PREP
-#undef LOAD
-#undef GRP4
-#undef FIXD
-#undef SAVE
+ case 0:
+ decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+ break;
+ case 1:
+ decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+ break;
+ case 2:
+ decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+ break;
+ default:
+ return NULL; // invalid channel type
}
}
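As a concrete example of the control encoding handled above: a control byte of 0xB4 (binary 10 11 01 00, read two bits per group from the bottom) assigns the four byte-groups of one 4-byte attribute the modes 0, 1, 3 (literal copy) and 2 (all zero), in that order.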
@@ -1088,23 +1643,29 @@ static unsigned int cpuid = getCpuFeatures();
} // namespace meshopt
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
{
using namespace meshopt;
assert(vertex_size > 0 && vertex_size <= 256);
assert(vertex_size % 4 == 0);
+ assert(level >= 0 && level <= 9); // only a subset of this range is used right now
+ assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);
+
+ version = version < 0 ? gEncodeVertexVersion : version;
+
+#if TRACE
+ memset(vertexstats, 0, sizeof(vertexstats));
+#endif
const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
unsigned char* data = buffer;
unsigned char* data_end = buffer + buffer_size;
- if (size_t(data_end - data) < 1 + vertex_size)
+ if (size_t(data_end - data) < 1)
return 0;
- int version = gEncodeVertexVersion;
-
*data++ = (unsigned char)(kVertexHeader | version);
unsigned char first_vertex[256] = {};
@@ -1116,40 +1677,110 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
size_t vertex_block_size = getVertexBlockSize(vertex_size);
+ unsigned char channels[64] = {};
+ if (version != 0 && level > 1 && vertex_count > 1)
+ for (size_t k = 0; k < vertex_size; k += 4)
+ {
+ int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+ int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+ assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+ channels[k / 4] = (unsigned char)channel;
+ }
+
size_t vertex_offset = 0;
while (vertex_offset < vertex_count)
{
size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
- data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+ data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
if (!data)
return 0;
vertex_offset += block_size;
}
- size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+ size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+ size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+ size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
- if (size_t(data_end - data) < tail_size)
+ if (size_t(data_end - data) < tail_size_pad)
return 0;
- // write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
- if (vertex_size < kTailMaxSize)
+ if (tail_size < tail_size_pad)
{
- memset(data, 0, kTailMaxSize - vertex_size);
- data += kTailMaxSize - vertex_size;
+ memset(data, 0, tail_size_pad - tail_size);
+ data += tail_size_pad - tail_size;
}
memcpy(data, first_vertex, vertex_size);
data += vertex_size;
+ if (version != 0)
+ {
+ memcpy(data, channels, vertex_size / 4);
+ data += vertex_size / 4;
+ }
+
assert(data >= buffer + tail_size);
assert(data <= buffer + buffer_size);
+#if TRACE
+ size_t total_size = data - buffer;
+
+ for (size_t k = 0; k < vertex_size; ++k)
+ {
+ const Stats& vsk = vertexstats[k];
+
+ printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+ size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+ double total_kr = total_k ? 1.0 / double(total_k) : 0;
+
+ if (version != 0)
+ {
+ int channel = channels[k / 4];
+
+ if ((channel & 3) == 2 && k % 4 == 0)
+ printf(" | ^%d", channel >> 4);
+ else
+ printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+ }
+
+ printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+ double(vsk.header) * total_kr * 100,
+ double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+ double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+ size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+ if (total_ctrl)
+ {
+ printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+ double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+ double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+ }
+
+ if (level >= 3)
+ printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+ double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+ double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+ double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+ double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+
+ printf("\n");
+ }
+#endif
+
return data - buffer;
}
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion);
+}
+
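A minimal usage sketch for the new entry point (assumptions: the meshoptimizer.h public header, a placeholder Vertex struct, and level 2 with version -1 meaning "encoder default", per the version < 0 fallback above):

#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // placeholder layout, 32 bytes

std::vector<unsigned char> encodeVertices(const Vertex* vertices, size_t vertex_count)
{
	std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
	size_t size = meshopt_encodeVertexBufferLevel(encoded.data(), encoded.size(), vertices, vertex_count, sizeof(Vertex), /* level= */ 2, /* version= */ -1);
	encoded.resize(size); // size == 0 would indicate the destination buffer was too small
	return encoded;
}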
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
using namespace meshopt;
@@ -1160,21 +1791,42 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
size_t vertex_block_size = getVertexBlockSize(vertex_size);
size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
+ size_t vertex_block_control_size = vertex_size / 4;
size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
size_t vertex_block_data_size = vertex_block_size;
- size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+ size_t tail_size = vertex_size + (vertex_size / 4);
+ size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+ size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+ assert(tail_size_pad >= kByteGroupDecodeLimit);
- return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+ return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
}
void meshopt_encodeVertexVersion(int version)
{
- assert(unsigned(version) <= 0);
+ assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
meshopt::gEncodeVertexVersion = version;
}
+int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+ if (buffer_size < 1)
+ return -1;
+
+ unsigned char header = buffer[0];
+
+ if ((header & 0xf0) != meshopt::kVertexHeader)
+ return -1;
+
+ int version = header & 0x0f;
+ if (version > meshopt::kDecodeVertexVersion)
+ return -1;
+
+ return version;
+}
+
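A corresponding decode-side sketch (illustrative helper; it relies only on the return conventions visible here: meshopt_decodeVertexVersion returning -1 for data that is not a supported vertex stream, and meshopt_decodeVertexBuffer returning 0 on success):

#include "meshoptimizer.h"

static bool decodeVertices(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	// reject streams with a wrong magic or a version newer than this decoder understands
	if (meshopt_decodeVertexVersion(buffer, buffer_size) < 0)
		return false;

	return meshopt_decodeVertexBuffer(destination, vertex_count, vertex_size, buffer, buffer_size) == 0;
}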
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
@@ -1182,7 +1834,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
assert(vertex_size > 0 && vertex_size <= 256);
assert(vertex_size % 4 == 0);
- const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+ const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1202,7 +1854,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
const unsigned char* data = buffer;
const unsigned char* data_end = buffer + buffer_size;
- if (size_t(data_end - data) < 1 + vertex_size)
+ if (size_t(data_end - data) < 1)
return -2;
unsigned char data_header = *data++;
@@ -1211,11 +1863,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
return -1;
int version = data_header & 0x0f;
- if (version > 0)
+ if (version > kDecodeVertexVersion)
return -1;
+ size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+ size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+ size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+ if (size_t(data_end - data) < tail_size_pad)
+ return -2;
+
+ const unsigned char* tail = data_end - tail_size;
+
unsigned char last_vertex[256];
- memcpy(last_vertex, data_end - vertex_size, vertex_size);
+ memcpy(last_vertex, tail, vertex_size);
+
+ const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
size_t vertex_block_size = getVertexBlockSize(vertex_size);
@@ -1225,16 +1888,14 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
{
size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
- data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+ data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
if (!data)
return -2;
vertex_offset += block_size;
}
- size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
- if (size_t(data_end - data) != tail_size)
+ if (size_t(data_end - data) != tail_size_pad)
return -3;
return 0;
@@ -1246,3 +1907,4 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
+#undef SIMD_LATENCYOPT
diff --git a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
index 4b5f444f0..3fd836083 100644
--- a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)
static void decodeFilterQuat(short* data, size_t count)
{
- const float scale = 1.f / sqrtf(2.f);
+ const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; ++i)
{
// recover scale from the high byte of the component
int sf = data[i * 4 + 3] | 3;
- float ss = scale / float(sf);
+ float s = float(sf);
- // convert x/y/z to [-1..1] (scaled...)
- float x = float(data[i * 4 + 0]) * ss;
- float y = float(data[i * 4 + 1]) * ss;
- float z = float(data[i * 4 + 2]) * ss;
+ // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+ float x = float(data[i * 4 + 0]);
+ float y = float(data[i * 4 + 1]);
+ float z = float(data[i * 4 + 2]);
- // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
- float ww = 1.f - x * x - y * y - z * z;
+ // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+ float ws = s * s;
+ float ww = ws * 2.f - x * x - y * y - z * z;
float w = sqrtf(ww >= 0.f ? ww : 0.f);
+ // compute final scale; note that all computations above are unscaled
+ // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+ float ss = scale / s;
+
// rounded signed float->int
- int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
- int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
- int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
- int wf = int(w * 32767.f + 0.5f);
+ int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
+ int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
+ int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
+ int wf = int(w * ss + 0.5f);
int qc = data[i * 4 + 3] & 3;
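For reference, this rescaling is algebraically identical to the previous path: with the implied per-component scale k = 1 / (sqrt(2) * sf), the new w = sqrt(2*sf^2 - x^2 - y^2 - z^2) combined with ss = 32767 / (sqrt(2) * sf) gives w * ss = 32767 * sqrt(1 - (k*x)^2 - (k*y)^2 - (k*z)^2), i.e. the same quantized result as normalizing x/y/z first, while keeping the components unscaled until the final multiply saves a few multiplications per quaternion.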
@@ -165,6 +170,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
data[i] = u.ui;
}
}
+
+template <typename T, typename ST>
+static void decodeFilterColor(T* data, size_t count)
+{
+ const float max = float((1 << (sizeof(T) * 8)) - 1);
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ // recover scale from alpha high bit
+ int as = data[i * 4 + 3];
+ as |= as >> 1;
+ as |= as >> 2;
+ as |= as >> 4;
+ as |= as >> 8; // noop for 8-bit
+
+ // convert to RGB in fixed point (co/cg are sign extended)
+ int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
+
+ int r = y + co - cg;
+ int g = y + cg;
+ int b = y - co - cg;
+
+ // expand alpha by one bit to match other components
+ int a = data[i * 4 + 3];
+ a = ((a << 1) & as) | (a & 1);
+
+ // compute scaling factor
+ float ss = max / float(as);
+
+ // rounded float->int
+ int rf = int(float(r) * ss + 0.5f);
+ int gf = int(float(g) * ss + 0.5f);
+ int bf = int(float(b) * ss + 0.5f);
+ int af = int(float(a) * ss + 0.5f);
+
+ data[i * 4 + 0] = T(rf);
+ data[i * 4 + 1] = T(gf);
+ data[i * 4 + 2] = T(bf);
+ data[i * 4 + 3] = T(af);
+ }
+}
#endif
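The "recover scale from alpha high bit" step used by decodeFilterColor above is the usual bit-smearing trick; a standalone illustration of the 8-bit case (hypothetical helper, not part of the patch):

// spreads the highest set bit of a downwards into a full mask, e.g. 0x2a (high bit 0x20) -> 0x3f;
// the filter then multiplies components by max / as to expand reduced-precision values back to the full range
static unsigned int smearHighBit8(unsigned int a)
{
	a |= a >> 1;
	a |= a >> 2;
	a |= a >> 4;
	return a;
}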
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -201,7 +247,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
#endif
#ifdef SIMD_SSE
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
@@ -246,7 +292,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
}
}
-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
@@ -295,8 +341,9 @@ static void decodeFilterOctSimd(short* data, size_t count)
__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
// patch in .w
- res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
- res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+ __m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0);
+ res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw));
+ res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
@@ -305,7 +352,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
- const float scale = 1.f / sqrtf(2.f);
+ const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -324,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
- __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+ __m128 s = _mm_cvtepi32_ps(sf);
- // convert x/y/z to [-1..1] (scaled...)
- __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
- __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
- __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+ // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+ __m128 x = _mm_cvtepi32_ps(xf);
+ __m128 y = _mm_cvtepi32_ps(yf);
+ __m128 z = _mm_cvtepi32_ps(zf);
- // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
- __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+ // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+ __m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
+ __m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
- __m128 s = _mm_set1_ps(32767.f);
+ // compute final scale; note that all computations above are unscaled
+ // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+ __m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);
// rounded signed float->int
- __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
- __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
- __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
- __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+ __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
+ __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
+ __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
+ __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));
// mix x/z and w/y to make 16-bit unpack easier
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -385,6 +435,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
}
}
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+ // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+ __m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
+ __m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
+ __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
+ __m128i af = _mm_srli_epi32(c4, 24);
+
+ // recover scale from alpha high bit
+ __m128i as = af;
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+
+ // expand alpha by one bit to match other components
+ af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+ // compute scaling factor
+ __m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
+
+ // convert to RGB in fixed point
+ __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+ __m128i gf = _mm_add_epi32(yf, cgf);
+ __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+ // rounded signed float->int
+ __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+ __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+ __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+ __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+ // repack rgba into final value
+ __m128i res = rr;
+ res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
+ res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
+ res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+ }
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
+ __m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
+
+ // gather both y/co 16-bit pairs in each 32-bit lane
+ __m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
+
+ // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+ __m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
+ __m128i cof = _mm_srai_epi32(c4_yco, 16);
+ __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
+ __m128i af = _mm_srli_epi32(c4_cga, 16);
+
+ // recover scale from alpha high bit
+ __m128i as = af;
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+ as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
+
+ // expand alpha by one bit to match other components
+ af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+ // compute scaling factor
+ __m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
+
+ // convert to RGB in fixed point
+ __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+ __m128i gf = _mm_add_epi32(yf, cgf);
+ __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+ // rounded signed float->int
+ __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+ __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+ __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+ __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+ // mix r/b and g/a to make 16-bit unpack easier
+ __m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
+ __m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
+
+ // pack r/g/b/a using 16-bit unpacks
+ __m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
+ __m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+ }
+}
#endif
#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
@@ -401,10 +550,17 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
return vmulq_f32(x, r);
}
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+ return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
#endif
#ifdef SIMD_NEON
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
@@ -431,29 +587,27 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
- float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
float32x4_t rl = vrsqrteq_f32(ll);
float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
- // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
// combine xr/yr/zr into final value
- int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
- res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
- res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+ int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+ res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
}
-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
@@ -485,21 +639,25 @@ static void decodeFilterOctSimd(short* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
- float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
float32x4_t rl = vrsqrteq_f32(ll);
rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+ float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
// mix x/z and y/0 to make 16-bit unpack easier
- int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+ int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -517,7 +675,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
- const float scale = 1.f / sqrtf(2.f);
+ const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -536,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
- float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+ float32x4_t s = vcvtq_f32_s32(sf);
- // convert x/y/z to [-1..1] (scaled...)
- float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
- float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
- float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+ // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+ float32x4_t x = vcvtq_f32_s32(xf);
+ float32x4_t y = vcvtq_f32_s32(yf);
+ float32x4_t z = vcvtq_f32_s32(zf);
- // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
- float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+ // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+ float32x4_t ws = vmulq_f32(s, s);
+ float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
- float32x4_t s = vdupq_n_f32(32767.f);
+ // compute final scale; note that all computations above are unscaled
+ // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+ float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
- int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
- int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
- int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
- int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+ int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+ int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+ int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+ int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
// mix x/z and w/y to make 16-bit unpack easier
- int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
- int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+ int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+ int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
- int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
- int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+ uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+ uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+ // store results to stack so that we can rotate using scalar instructions
+ // TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+ volatile uint64_t res[4];
+ vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+ vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
// rotate and store
- uint64_t* out = (uint64_t*)&data[i * 4];
+ uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
- out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
- out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
- out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
- out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+ out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+ out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+ out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+ out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
}
}
@@ -595,10 +762,112 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
}
}
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+ // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+ int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
+ int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
+ int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
+ int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
+
+ // recover scale from alpha high bit
+ int32x4_t as = af;
+ as = vorrq_s32(as, vshrq_n_s32(as, 1));
+ as = vorrq_s32(as, vshrq_n_s32(as, 2));
+ as = vorrq_s32(as, vshrq_n_s32(as, 4));
+
+ // expand alpha by one bit to match other components
+ af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+ // compute scaling factor
+ float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
+
+ // convert to RGB in fixed point
+ int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+ int32x4_t gf = vaddq_s32(yf, cgf);
+ int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+ const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+ int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+ int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+ int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+ int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
+
+ // repack rgba into final value
+ int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
+
+ vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+ }
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t c4_0 = vld1q_s32(reinterpret_cast