diff --git a/Content/Editor/Camera/M_Camera.flax b/Content/Editor/Camera/M_Camera.flax
index 7d7213a8d..7d4c71666 100644
--- a/Content/Editor/Camera/M_Camera.flax
+++ b/Content/Editor/Camera/M_Camera.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:7edc1b9d2c7fbd32fcf778814deb719c71781f657da050ac0c7c78984aeb360d
+oid sha256:b73d774c71bd7b46c9c4198a4c957055e6447e31d8252813b272db92301475e7
size 29533
diff --git a/Content/Editor/CubeTexturePreviewMaterial.flax b/Content/Editor/CubeTexturePreviewMaterial.flax
index 5969c90fa..2d732c086 100644
--- a/Content/Editor/CubeTexturePreviewMaterial.flax
+++ b/Content/Editor/CubeTexturePreviewMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ac6023e5d6525c3b7c385a380ed9d6fc71ec9d683c587391d14c9daf6653e31a
+oid sha256:c4ec07a3b7e0a2dfd4332598a982c3192c0c357c6bcd128d7a7797fb483780e7
size 31445
diff --git a/Content/Editor/DebugMaterials/DDGIDebugProbes.flax b/Content/Editor/DebugMaterials/DDGIDebugProbes.flax
index fc45d33cc..d082bd8e7 100644
--- a/Content/Editor/DebugMaterials/DDGIDebugProbes.flax
+++ b/Content/Editor/DebugMaterials/DDGIDebugProbes.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ecd573c40f534f293d4827b1a8150d439d4f5e7729552474926208c5814f3d3e
+oid sha256:2830919bea988e1f8bd8299ceac34b8a3695418e2f22ca670f2fec3b3d6d1a2f
size 41149
diff --git a/Content/Editor/DebugMaterials/SingleColor/Decal.flax b/Content/Editor/DebugMaterials/SingleColor/Decal.flax
index 05e99be76..b94f22bc8 100644
--- a/Content/Editor/DebugMaterials/SingleColor/Decal.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Decal.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c0b2ad25738c2bc55bb3e76fc94fc81992b1d65b8b3091b132c75b2ed064c517
-size 10398
+oid sha256:588c29a4b239c32d4b125052e4054a29cf5140562e90ca6fac4d2952e03f66c7
+size 10397
diff --git a/Content/Editor/DebugMaterials/SingleColor/Particle.flax b/Content/Editor/DebugMaterials/SingleColor/Particle.flax
index 7a328e7a0..de2043874 100644
--- a/Content/Editor/DebugMaterials/SingleColor/Particle.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Particle.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:02ddea5bcb3fccb697081e47cc26a0b546b23b89ceca299e702a1d431775dfd6
+oid sha256:b39cd76254f341c93e83625475b6e7896ef34f1d6d650da52e649bc055d0d03e
size 33503
diff --git a/Content/Editor/DebugMaterials/SingleColor/Surface.flax b/Content/Editor/DebugMaterials/SingleColor/Surface.flax
index 84e05ee36..7ae8a69c3 100644
--- a/Content/Editor/DebugMaterials/SingleColor/Surface.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Surface.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:741a7619b5aebc6c7c6a573a0407e8b7aa42d1b50d0ed5cf6a21026932807d0e
+oid sha256:5861e912cf822c9478f824390f6258d84821b7289e3e993a7dee38b77c5a2f80
size 29398
diff --git a/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax b/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax
index ab4591176..fdcb880df 100644
--- a/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:358370943d21a97f8b45ff2181b7c6c2d7a6297e3f166ae7a77363aadf89b152
+oid sha256:b9ed2869a2a754423e0b8c456eed621bd06bdb50cacf7a972a7f024e40a1ea6a
size 32954
diff --git a/Content/Editor/DebugMaterials/SingleColor/Terrain.flax b/Content/Editor/DebugMaterials/SingleColor/Terrain.flax
index 54151179a..ad27a422c 100644
--- a/Content/Editor/DebugMaterials/SingleColor/Terrain.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Terrain.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:486b4db3e1d825d026753d944a04defe4d72eb73eb03a438944c366f19de824e
+oid sha256:05c27ac416ef922ee247adc12a115fd522eb3a1d8873e1056914cd96893a3097
size 21096
diff --git a/Content/Editor/DefaultFontMaterial.flax b/Content/Editor/DefaultFontMaterial.flax
index 8d48c5827..d84425aab 100644
--- a/Content/Editor/DefaultFontMaterial.flax
+++ b/Content/Editor/DefaultFontMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ebdfc478caabc84a3a75384a64d262d2d509bbac3540eea462e45911719c288f
+oid sha256:8e3d4ca149e143fee31e2d038b8efec526ca995dbe13258fbb68c89cd43ecbf7
size 29627
diff --git a/Content/Editor/Gizmo/FoliageBrushMaterial.flax b/Content/Editor/Gizmo/FoliageBrushMaterial.flax
index 79385ada6..eb7e784c9 100644
--- a/Content/Editor/Gizmo/FoliageBrushMaterial.flax
+++ b/Content/Editor/Gizmo/FoliageBrushMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:aa4f1a733150c62064cac60c07980df7c84bb6163dc9507782aa98df07f48874
+oid sha256:7af1150d6e7cb6ecce5cd039f0edc92967c986a13903a201d6dc15ed0751dc57
size 39637
diff --git a/Content/Editor/Gizmo/Material.flax b/Content/Editor/Gizmo/Material.flax
index ace3bde90..bbb114662 100644
--- a/Content/Editor/Gizmo/Material.flax
+++ b/Content/Editor/Gizmo/Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:26e1832496c01cb31bd4dc9000d3cd326ea4fd54de02910d3801d2641bff685c
+oid sha256:d575ca1b202c84b8268687b391be5fc8d55497ffa23fb3cd4287fa667de654ab
size 34240
diff --git a/Content/Editor/Gizmo/MaterialWire.flax b/Content/Editor/Gizmo/MaterialWire.flax
index 7ea0a596f..fb4b8acca 100644
--- a/Content/Editor/Gizmo/MaterialWire.flax
+++ b/Content/Editor/Gizmo/MaterialWire.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ca8bc1ac9d45534d3efd3b4308d7492fa016726b4ec744be26619069ce911b73
+oid sha256:26f2d88aab9c0cad36ae527b038a36b69755ff3a5a920e8c4563dd5e1ed8ec65
size 32689
diff --git a/Content/Editor/Gizmo/SelectionOutlineMaterial.flax b/Content/Editor/Gizmo/SelectionOutlineMaterial.flax
index 0c1461b72..b5d224d58 100644
--- a/Content/Editor/Gizmo/SelectionOutlineMaterial.flax
+++ b/Content/Editor/Gizmo/SelectionOutlineMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:09f7dff17af9cd055352e0da534f3466c8efa235c40faf5e56da92c788342f6a
-size 17394
+oid sha256:5bb75934622d9251a8a9e72cfe4905091770798ffed22de680a70f98434d0ed7
+size 16241
diff --git a/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax b/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax
index bd4935d96..5a5262e2b 100644
--- a/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax
+++ b/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:1bc0005c64c561a430a17e4707abc000e06498af968890e2c4e223dc07f07c12
+oid sha256:a1afa76c3f9400da065c150a6a58adc904c3596f650e04dfd87b5e1c1b34695e
size 30655
diff --git a/Content/Editor/Highlight Material.flax b/Content/Editor/Highlight Material.flax
index ccecb98aa..9d09ea792 100644
--- a/Content/Editor/Highlight Material.flax
+++ b/Content/Editor/Highlight Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:95d172cd12bb3c818fbccf737e78ab282bc8d0880aa8f45af0562850b0eabe4b
-size 31616
+oid sha256:1290ae85e4fe41f9d8c1919b33e165287f79377aeddc68f9117c1795ca341003
+size 31267
diff --git a/Content/Editor/Icons/IconsMaterial.flax b/Content/Editor/Icons/IconsMaterial.flax
index b24941463..2ccbce8c9 100644
--- a/Content/Editor/Icons/IconsMaterial.flax
+++ b/Content/Editor/Icons/IconsMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:5ca4baa1419080395dcf2b5757676406288f112754bc3cd2f27610b58d199622
+oid sha256:340cc500a160344b43b21ed8c4c22b6d776f406581f606ced62a3e92c5bef18a
size 31300
diff --git a/Content/Editor/IesProfilePreviewMaterial.flax b/Content/Editor/IesProfilePreviewMaterial.flax
index 99bc2662c..b3a382132 100644
--- a/Content/Editor/IesProfilePreviewMaterial.flax
+++ b/Content/Editor/IesProfilePreviewMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:b3b4c61b04d372d2430a7c08dec612af6caa0e57b1cb47ea44d171d729d3f8f8
+oid sha256:d444cd33ec8d2e1e0e6651c3979260f05c06c8bac33ce2441d6974ae4fa178e4
size 20443
diff --git a/Content/Editor/Particles/Particle Material Color.flax b/Content/Editor/Particles/Particle Material Color.flax
index 19eb7a3c2..91b06b2fb 100644
--- a/Content/Editor/Particles/Particle Material Color.flax
+++ b/Content/Editor/Particles/Particle Material Color.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:6f3b8a7c48c55e33a41f9fe4dbf9b3109b0e734ff154d6cbd3e4101013b01649
+oid sha256:906443c7db821361b32780c17735bc9477ea96c8979dee371a4899635246af48
size 31708
diff --git a/Content/Editor/Particles/Smoke Material.flax b/Content/Editor/Particles/Smoke Material.flax
index 527d19842..e6396c194 100644
--- a/Content/Editor/Particles/Smoke Material.flax
+++ b/Content/Editor/Particles/Smoke Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:2275282d4e3b5e012a0bbc93fca0d6ffdad89e5a5f0c289678f70748f2efab56
-size 40655
+oid sha256:16db9c1a18b64aea2dcdf3e74f9a44c652bf8bd9b33a5bfda39555d8c002a358
+size 39774
diff --git a/Content/Editor/SpriteMaterial.flax b/Content/Editor/SpriteMaterial.flax
index d967a4ea4..2a05418b2 100644
--- a/Content/Editor/SpriteMaterial.flax
+++ b/Content/Editor/SpriteMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:6f5e82be7efa6489cfdfd1babeb1fbb90507aaff7c04eb5f64a4971adf0a2164
+oid sha256:56254b02ffc937d61e8e8fa6492d4805e944ca639c7fcfc0f751b4ac2442365d
size 30734
diff --git a/Content/Editor/Terrain/Circle Brush Material.flax b/Content/Editor/Terrain/Circle Brush Material.flax
index 6ddc5f3e9..f481be389 100644
--- a/Content/Editor/Terrain/Circle Brush Material.flax
+++ b/Content/Editor/Terrain/Circle Brush Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:2c7fde7be7d6f9876f9c0db02632c098ab95ade7de57c583d2e495c8ae8665bd
+oid sha256:16eefa75a2ae99bba658c4e9b8e8741187b90e577193f76394872764fff2ca0b
size 28232
diff --git a/Content/Editor/Terrain/Highlight Terrain Material.flax b/Content/Editor/Terrain/Highlight Terrain Material.flax
index c573eb3ee..579db477c 100644
--- a/Content/Editor/Terrain/Highlight Terrain Material.flax
+++ b/Content/Editor/Terrain/Highlight Terrain Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:9369a554ea1776154f5e39d4aaed044f928d98f1f5955b7590b0972015b07438
+oid sha256:e25a3c9e130e51b28dfe5ce43678f52c277c0def83142a2853c4c8ca84dbf417
size 21179
diff --git a/Content/Editor/TexturePreviewMaterial.flax b/Content/Editor/TexturePreviewMaterial.flax
index 2c91f9d8f..d75e19d5e 100644
--- a/Content/Editor/TexturePreviewMaterial.flax
+++ b/Content/Editor/TexturePreviewMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:4d61f178e72e4d983a919b76368e03c66995ecf50935f6f55b660e34f58755a2
+oid sha256:79de09ba0616eb6066171c2b80cdb6c4235cb52be4836d23162bb9c2585760a0
size 11058
diff --git a/Content/Editor/Wires Debug Material.flax b/Content/Editor/Wires Debug Material.flax
index 308a6230a..b1f87a7d0 100644
--- a/Content/Editor/Wires Debug Material.flax
+++ b/Content/Editor/Wires Debug Material.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c7a42b1bc5a34f9c47d1aeb773ef26ce470b2d88c2b092828f0fcb439583ef27
-size 31616
+oid sha256:02d4c767fb59c67fef16ccc081f6f371bad329a5333047f9f79fd3d50b911f93
+size 31753
diff --git a/Content/Engine/DefaultDeformableMaterial.flax b/Content/Engine/DefaultDeformableMaterial.flax
index a397d1ad8..1244ae3ec 100644
--- a/Content/Engine/DefaultDeformableMaterial.flax
+++ b/Content/Engine/DefaultDeformableMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:be21bb7eecd9c774196dbaa89d1b049b108fc0929d648795056c977fe00364ab
-size 19582
+oid sha256:d1f556b230cea8e83d00bd4357d34a77e5e468389a5f3bb615e30f6a3ce3ace4
+size 19734
diff --git a/Content/Engine/DefaultMaterial.flax b/Content/Engine/DefaultMaterial.flax
index eddcbace8..bd57e7d44 100644
--- a/Content/Engine/DefaultMaterial.flax
+++ b/Content/Engine/DefaultMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0a8a4ad5e763704263b94a7a7e0cc30ab7b1cd1abcb5ccae2d4c6062a65920df
-size 31928
+oid sha256:c4ec872b3433d58f8aed640c6efee3d911f226740b4844cb07ed0bf94c00ea18
+size 32080
diff --git a/Content/Engine/DefaultRadialMenu.flax b/Content/Engine/DefaultRadialMenu.flax
index 60e2ba5f9..5fba9092e 100644
--- a/Content/Engine/DefaultRadialMenu.flax
+++ b/Content/Engine/DefaultRadialMenu.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c4151a58e5314937efcd3bdcb9fe0bdd5047b8705931e45e0a4e71a4470e16a0
+oid sha256:0da99403c069966d05daea7fc11d32f20f88bac0463fbc08724840e249ee3bd2
size 21700
diff --git a/Content/Engine/DefaultTerrainMaterial.flax b/Content/Engine/DefaultTerrainMaterial.flax
index b302ade35..4147fe0e4 100644
--- a/Content/Engine/DefaultTerrainMaterial.flax
+++ b/Content/Engine/DefaultTerrainMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c5cf6924809b9bd7ad3c09722a93f327a0d111676060d136df9c14ab34e8475b
-size 23930
+oid sha256:bdfa3b4842a5734d2cd8110af03599b4a5280b33a72b2ba435cd19487cebcde6
+size 24082
diff --git a/Content/Engine/SingleColorMaterial.flax b/Content/Engine/SingleColorMaterial.flax
index d6d179150..6d556af2b 100644
--- a/Content/Engine/SingleColorMaterial.flax
+++ b/Content/Engine/SingleColorMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:750f69ce59ef020d2e2186ed6c4bf7aac67ecb1692287e358eaed969fc36381a
+oid sha256:6ff8f127d46e68e3423339a352f623c079f2c5d93512c5e9b25841edc7cd0f05
size 29615
diff --git a/Content/Engine/SkyboxMaterial.flax b/Content/Engine/SkyboxMaterial.flax
index cc369ceee..b51c5bce7 100644
--- a/Content/Engine/SkyboxMaterial.flax
+++ b/Content/Engine/SkyboxMaterial.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3eecc9556af6c2a79d39a7e1c52e4019bdccfb43b074eaddd18600a5854dbffe
+oid sha256:14c9833ed19302ea7c6e730fff63f1b72dbac71dc2b49c1d62edb61ccaa68b6f
size 31974
diff --git a/Content/Shaders/GI/DDGI.flax b/Content/Shaders/GI/DDGI.flax
index 6739b2436..257953bf9 100644
--- a/Content/Shaders/GI/DDGI.flax
+++ b/Content/Shaders/GI/DDGI.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:5577ef4ce821b08a38afe17b9e5d11cb0b409eb05dd89b2ca76ea95d88085dc0
-size 32893
+oid sha256:5b017cf857f443553020e4bc7c8c8c5da3a826a2514322664a023ffa6005f7a5
+size 38217
diff --git a/Content/Shaders/GI/GlobalSurfaceAtlas.flax b/Content/Shaders/GI/GlobalSurfaceAtlas.flax
index 1b0173ba5..57990c249 100644
--- a/Content/Shaders/GI/GlobalSurfaceAtlas.flax
+++ b/Content/Shaders/GI/GlobalSurfaceAtlas.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0f34bf867df5f4296ca66ac691c2bca4efa168fb9e21ca4e613e8086669575cf
-size 13296
+oid sha256:615dff65b01507be6c4de722e126324aba20fc197f8e12dafaa94a05e46cba6e
+size 13222
diff --git a/Content/Shaders/GlobalSignDistanceField.flax b/Content/Shaders/GlobalSignDistanceField.flax
index 590e8f3a9..5afcb4bf4 100644
--- a/Content/Shaders/GlobalSignDistanceField.flax
+++ b/Content/Shaders/GlobalSignDistanceField.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:064f54786958f109222c49cbc0358ff4f345b30010fcd5e8cc1fab7bdc68c4fe
-size 13349
+oid sha256:1f07ebb16820897e8598ae7a0627cb75b3d28e9dceea3ad4bd9ff543d5cdd01c
+size 13979
diff --git a/Flax.flaxproj b/Flax.flaxproj
index 100d4e9ff..74ab82f55 100644
--- a/Flax.flaxproj
+++ b/Flax.flaxproj
@@ -4,7 +4,7 @@
"Major": 1,
"Minor": 11,
"Revision": 0,
- "Build": 6805
+ "Build": 6806
},
"Company": "Flax",
"Copyright": "Copyright (c) 2012-2025 Wojciech Figat. All rights reserved.",
diff --git a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
index 954599347..9844f3fda 100644
--- a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
+++ b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
@@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated
settingsButton.Tag = script;
settingsButton.Clicked += OnSettingsButtonClicked;
- group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2);
+ // Adjust margin to not overlap with other ui elements in the header
+ group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin };
group.Object(values, editor);
// Remove drop down arrows and containment lines if no objects in the group
if (group.Children.Count == 0)
diff --git a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
index b3fff5644..28593a7f5 100644
--- a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
+++ b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
@@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors
protected bool NotNullItems;
private IntValueBox _sizeBox;
+ private Label _label;
private Color _background;
private int _elementsCount, _minCount, _maxCount;
private bool _readOnly;
@@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors
Parent = dropPanel,
};
- var label = new Label
+ _label = new Label
{
Text = "Size",
AnchorPreset = AnchorPresets.TopRight,
@@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors
Resize(Count + 1);
};
}
- }
+ Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged;
+ }
+
private void OnSetupContextMenu(ContextMenu menu, DropPanel panel)
{
if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal)))
@@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors
});
}
+ private void OnLayoutSizeChanged(Control control)
+ {
+ if (Layout.ContainerControl is DropPanel dropPanel)
+ {
+ // Hide "Size" text when array editor title overlaps
+ var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText);
+ if (headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left)
+ _label.TextColor = _label.TextColorHighlighted = Color.Transparent;
+ else
+ _label.TextColor = _label.TextColorHighlighted = FlaxEngine.GUI.Style.Current.Foreground;
+ }
+ }
+
///
protected override void Deinitialize()
{
_sizeBox = null;
+ Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged;
base.Deinitialize();
}
diff --git a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
index 64bc9080b..055c6a29d 100644
--- a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
+++ b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
@@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements
{
var style = Style.Current;
var settingsButtonSize = Panel.HeaderHeight;
- return new Image
+ Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin };
+; return new Image
{
TooltipText = "Settings",
AutoFocus = true,
diff --git a/Source/Editor/GUI/Input/ValueBox.cs b/Source/Editor/GUI/Input/ValueBox.cs
index 674ee0697..88ec9a4ee 100644
--- a/Source/Editor/GUI/Input/ValueBox.cs
+++ b/Source/Editor/GUI/Input/ValueBox.cs
@@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input
///
public event Action SlidingEnd;
+ ///
+ /// If enabled, pressing the arrow up or down key increments/ decrements the value.
+ ///
+ public bool ArrowKeysIncrement = true;
+
///
/// Gets or sets the slider speed. Use value 0 to disable and hide slider UI.
///
@@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input
ResetViewOffset();
}
+ ///
+ public override bool OnKeyDown(KeyboardKeys key)
+ {
+ if (ArrowKeysIncrement && (key == KeyboardKeys.ArrowUp || key == KeyboardKeys.ArrowDown))
+ {
+ bool altDown = Root.GetKey(KeyboardKeys.Alt);
+ bool shiftDown = Root.GetKey(KeyboardKeys.Shift);
+ bool controlDown = Root.GetKey(KeyboardKeys.Control);
+ float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1f));
+ float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue;
+
+ _startSlideValue = Value;
+ ApplySliding(slideDelta);
+ EndSliding();
+ Focus();
+ return true;
+ }
+
+ return base.OnKeyDown(key);
+ }
+
///
public override bool OnMouseDown(Float2 location, MouseButton button)
{
diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs
index c02992041..8a678370a 100644
--- a/Source/Editor/Modules/UIModule.cs
+++ b/Source/Editor/Modules/UIModule.cs
@@ -125,6 +125,7 @@ namespace FlaxEditor.Modules
private ContextMenuButton _menuToolsProfilerWindow;
private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault;
private ContextMenuButton _menuToolsTakeScreenshot;
+ private ContextMenuButton _menuToolsOpenLocalFolder;
private ContextMenuChildMenu _menuWindowApplyWindowLayout;
private ToolStripButton _toolStripSaveAll;
@@ -725,6 +726,16 @@ namespace FlaxEditor.Modules
_menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot);
cm.AddSeparator();
cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
+ cm.AddSeparator();
+ var childMenu = cm.AddChildMenu("Open Product Local folder");
+ childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
+ _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () =>
+ {
+ string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+ GameSettings settings = GameSettings.Load();
+ string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+ FileSystem.ShowFileExplorer(path);
+ });
// Window
MenuWindow = MainMenu.AddButton("Window");
@@ -1062,6 +1073,10 @@ namespace FlaxEditor.Modules
_menuToolsBuildNavMesh.Enabled = canEdit;
_menuToolsCancelBuilding.Enabled = GameCooker.IsRunning;
_menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0;
+ string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+ GameSettings settings = GameSettings.Load();
+ string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+ _menuToolsOpenLocalFolder.Enabled = Directory.Exists(path);
c.PerformLayout();
}
diff --git a/Source/Editor/Options/InputOptions.cs b/Source/Editor/Options/InputOptions.cs
index ab473ebed..a759b7247 100644
--- a/Source/Editor/Options/InputOptions.cs
+++ b/Source/Editor/Options/InputOptions.cs
@@ -571,6 +571,10 @@ namespace FlaxEditor.Options
[EditorDisplay("View Flags"), EditorOrder(3260)]
public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift);
+ [DefaultValue(typeof(InputBinding), "None")]
+ [EditorDisplay("View Flags"), EditorOrder(3270)]
+ public InputBinding Particles = new InputBinding(KeyboardKeys.None);
+
#endregion
#region Interface
diff --git a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
index c4fd47f71..4a7150972 100644
--- a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
+++ b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
@@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors
if (value is BoxCollider collider)
collider.AutoResize(!_keepLocalOrientation);
}
+ Presenter.OnModified();
}
}
diff --git a/Source/Editor/Utilities/ShuntingYardParser.cs b/Source/Editor/Utilities/ShuntingYardParser.cs
index 47e2275e5..fe473389c 100644
--- a/Source/Editor/Utilities/ShuntingYardParser.cs
+++ b/Source/Editor/Utilities/ShuntingYardParser.cs
@@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities
/// The result value.
public static double Parse(string text)
{
+ // Hack to allow parsing numbers while using "_" as a separator (like this: 1_000)
+ text = text.Replace("_", string.Empty);
+
var tokens = Tokenize(text);
var rpn = OrderTokens(tokens);
return EvaluateRPN(rpn);
diff --git a/Source/Editor/Viewport/EditorViewport.cs b/Source/Editor/Viewport/EditorViewport.cs
index c16d3d9f5..2af065c68 100644
--- a/Source/Editor/Viewport/EditorViewport.cs
+++ b/Source/Editor/Viewport/EditorViewport.cs
@@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport
InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog);
InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight);
InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals);
+ InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles);
InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess);
InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom);
InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping);
@@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport
new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog),
new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight),
new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals),
+ new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles),
new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess),
new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom),
new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping),
@@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport
if (cm.Visible == false)
return;
var ccm = (ContextMenu)cm;
+ var flags = Task.View.Flags;
foreach (var e in ccm.Items)
{
if (e is ContextMenuButton b && b.Tag != null)
{
var v = (ViewFlags)b.Tag;
- b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
+ b.Icon = (flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
}
}
}
diff --git a/Source/Editor/Windows/EditorOptionsWindow.cs b/Source/Editor/Windows/EditorOptionsWindow.cs
index 0ee9a92d7..c6bf2fd16 100644
--- a/Source/Editor/Windows/EditorOptionsWindow.cs
+++ b/Source/Editor/Windows/EditorOptionsWindow.cs
@@ -45,7 +45,7 @@ namespace FlaxEditor.Windows
{
Parent = this
};
- _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save");
+ _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save.");
_saveButton.Enabled = false;
_tabs = new Tabs
@@ -104,6 +104,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = true;
_isDataDirty = true;
+ if (!Title.EndsWith('*'))
+ Title += "*";
}
}
@@ -113,6 +115,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = false;
_isDataDirty = false;
+ if (Title.EndsWith('*'))
+ Title = Title.Remove(Title.Length - 1);
}
}
diff --git a/Source/Engine/Content/Assets/Material.cpp b/Source/Engine/Content/Assets/Material.cpp
index 019fd9dd8..b4cf55d4d 100644
--- a/Source/Engine/Content/Assets/Material.cpp
+++ b/Source/Engine/Content/Assets/Material.cpp
@@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const
return false;
}
+#if USE_EDITOR
+
+void Material::GetReferences(Array& assets, Array& files) const
+{
+ ShaderAssetTypeBase::GetReferences(assets, files);
+
+ // Collect references from material graph (needs to load it)
+ if (!WaitForLoaded() && HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE))
+ {
+ ScopeLock lock(Locker);
+ if (!LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE)))
+ {
+ const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE);
+ if (surfaceChunk)
+ {
+ MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size());
+ MaterialGraph graph;
+ if (!graph.Load(&stream, false))
+ {
+ graph.GetReferences(assets);
+ }
+ }
+ }
+ }
+
+}
+
+#endif
+
const MaterialInfo& Material::GetInfo() const
{
if (_materialShader)
diff --git a/Source/Engine/Content/Assets/Material.h b/Source/Engine/Content/Assets/Material.h
index 4ce47b154..cd2ae8e97 100644
--- a/Source/Engine/Content/Assets/Material.h
+++ b/Source/Engine/Content/Assets/Material.h
@@ -38,6 +38,9 @@ public:
public:
// [MaterialBase]
bool IsMaterialInstance() const override;
+#if USE_EDITOR
+ void GetReferences(Array& assets, Array& files) const override;
+#endif
// [IMaterial]
const MaterialInfo& GetInfo() const override;
diff --git a/Source/Engine/Debug/DebugDraw.cpp b/Source/Engine/Debug/DebugDraw.cpp
index 7c798f88f..bea9e76f4 100644
--- a/Source/Engine/Debug/DebugDraw.cpp
+++ b/Source/Engine/Debug/DebugDraw.cpp
@@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool de
return list->Get() + startIndex;
}
+FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest)
+{
+ Array* list;
+ if (depthTest)
+ list = duration > 0 ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles;
+ else
+ list = duration > 0 ? &Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles;
+ const int32 startIndex = list->Count();
+ list->AddUninitialized(count);
+ return list->Get() + startIndex;
+}
+
inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer)
{
Matrix w, fw, m;
@@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const Color& col
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Float3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const SpanOrigin;
for (int32 i = 0; i < indices.Length();)
{
@@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const Color& co
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
- auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+ auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Double3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span& vertices, const SpanOrigin;
for (int32 i = 0; i < indices.Length();)
{
diff --git a/Source/Engine/Graphics/Enums.h b/Source/Engine/Graphics/Enums.h
index f6af6c16b..107fe3533 100644
--- a/Source/Engine/Graphics/Enums.h
+++ b/Source/Engine/Graphics/Enums.h
@@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64
///
LightsDebug = 1 << 27,
+ ///
+ /// Shows/hides particle effects.
+ ///
+ Particles = 1 << 28,
+
///
/// Default flags for Game.
///
- DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky,
+ DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles,
///
/// Default flags for Editor.
///
- DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky,
+ DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles,
///
/// Default flags for materials/models previews generating.
///
- DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky,
+ DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles,
};
DECLARE_ENUM_OPERATORS(ViewFlags);
diff --git a/Source/Engine/Graphics/Materials/MaterialShader.h b/Source/Engine/Graphics/Materials/MaterialShader.h
index 5da4ee04f..bb68520c0 100644
--- a/Source/Engine/Graphics/Materials/MaterialShader.h
+++ b/Source/Engine/Graphics/Materials/MaterialShader.h
@@ -10,7 +10,7 @@
///
/// Current materials shader version.
///
-#define MATERIAL_GRAPH_VERSION 178
+#define MATERIAL_GRAPH_VERSION 179
class Material;
class GPUShader;
diff --git a/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp b/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp
index 64dfe8303..19f2042f4 100644
--- a/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp
+++ b/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp
@@ -191,7 +191,7 @@ bool GlobalIlluminationFeature::Bind(MaterialShader::BindParameters& params, Spa
{
// Unbind SRVs to prevent issues
data.DDGI.CascadesCount = 0;
- data.DDGI.FallbackIrradiance = Float3::Zero;
+ data.DDGI.FallbackIrradiance = Float4::Zero;
params.GPUContext->UnBindSR(srv + 0);
params.GPUContext->UnBindSR(srv + 1);
params.GPUContext->UnBindSR(srv + 2);
diff --git a/Source/Engine/Graphics/Models/MeshAccessor.h b/Source/Engine/Graphics/Models/MeshAccessor.h
index 67b30e502..25fc01a1a 100644
--- a/Source/Engine/Graphics/Models/MeshAccessor.h
+++ b/Source/Engine/Graphics/Models/MeshAccessor.h
@@ -17,7 +17,7 @@ public:
///
/// Mesh data stream.
///
- struct Stream
+ struct FLAXENGINE_API Stream
{
friend MeshAccessor;
diff --git a/Source/Engine/Graphics/PostProcessSettings.h b/Source/Engine/Graphics/PostProcessSettings.h
index 670d99611..a300063e7 100644
--- a/Source/Engine/Graphics/PostProcessSettings.h
+++ b/Source/Engine/Graphics/PostProcessSettings.h
@@ -378,7 +378,7 @@ API_STRUCT() struct FLAXENGINE_API GlobalIlluminationSettings : ISerializable
/// The irradiance lighting outside the GI range used as a fallback to prevent pure-black scene outside the Global Illumination range.
///
API_FIELD(Attributes="EditorOrder(40), PostProcessSetting((int)GlobalIlluminationSettingsOverride.FallbackIrradiance)")
- Color FallbackIrradiance = Color::Black;
+ Color FallbackIrradiance = Color::Transparent;
public:
///
diff --git a/Source/Engine/Graphics/RenderTools.cpp b/Source/Engine/Graphics/RenderTools.cpp
index b0d587c8d..effbe6e1b 100644
--- a/Source/Engine/Graphics/RenderTools.cpp
+++ b/Source/Engine/Graphics/RenderTools.cpp
@@ -620,6 +620,40 @@ void RenderTools::ComputeSphereModelDrawMatrix(const RenderView& view, const Flo
resultIsViewInside = Float3::DistanceSquared(view.Position, position) < Math::Square(radius * 1.1f); // Manually tweaked bias
}
+Float3 RenderTools::GetColorQuantizationError(PixelFormat format)
+{
+ Float3 mantissaBits;
+ switch (format)
+ {
+ case PixelFormat::R11G11B10_Float:
+ mantissaBits = Float3(6, 6, 5);
+ break;
+ case PixelFormat::R10G10B10A2_UNorm:
+ mantissaBits = Float3(10, 10, 10);
+ break;
+ case PixelFormat::R16G16B16A16_Float:
+ mantissaBits = Float3(16, 16, 16);
+ break;
+ case PixelFormat::R32G32B32A32_Float:
+ mantissaBits = Float3(23, 23, 23);
+ break;
+ case PixelFormat::R9G9B9E5_SharedExp:
+ mantissaBits = Float3(5, 6, 5);
+ break;
+ case PixelFormat::R8G8B8A8_UNorm:
+ case PixelFormat::B8G8R8A8_UNorm:
+ mantissaBits = Float3(8, 8, 8);
+ break;
+ default:
+ return Float3::Zero;
+ }
+ return {
+ Math::Pow(0.5f, mantissaBits.X),
+ Math::Pow(0.5f, mantissaBits.Y),
+ Math::Pow(0.5f, mantissaBits.Z)
+ };
+}
+
int32 MipLevelsCount(int32 width)
{
int32 result = 1;
diff --git a/Source/Engine/Graphics/RenderTools.h b/Source/Engine/Graphics/RenderTools.h
index 18357a13a..5f0dc23dc 100644
--- a/Source/Engine/Graphics/RenderTools.h
+++ b/Source/Engine/Graphics/RenderTools.h
@@ -140,6 +140,9 @@ public:
static void CalculateTangentFrame(Float3& resultNormal, Float4& resultTangent, const Float3& normal, const Float3& tangent);
static void ComputeSphereModelDrawMatrix(const RenderView& view, const Float3& position, float radius, Matrix& resultWorld, bool& resultIsViewInside);
+
+ // Calculates error for a given render target format to reduce floating-point precision artifacts via QuantizeColor (from Noise.hlsl).
+ static Float3 GetColorQuantizationError(PixelFormat format);
};
// Calculates mip levels count for a texture 1D.
diff --git a/Source/Engine/Input/Input.cpp b/Source/Engine/Input/Input.cpp
index 8438977b1..7048140ef 100644
--- a/Source/Engine/Input/Input.cpp
+++ b/Source/Engine/Input/Input.cpp
@@ -80,6 +80,8 @@ Delegate Input::MouseDoubleClick;
Delegate Input::MouseWheel;
Delegate Input::MouseMove;
Action Input::MouseLeave;
+Delegate Input::GamepadButtonDown;
+Delegate Input::GamepadButtonUp;
Delegate Input::TouchDown;
Delegate Input::TouchMove;
Delegate Input::TouchUp;
@@ -1027,6 +1029,19 @@ void InputService::Update()
break;
}
}
+ // TODO: route gamepad button events into global InputEvents queue to improve processing
+ for (int32 i = 0; i < Input::Gamepads.Count(); i++)
+ {
+ auto gamepad = Input::Gamepads[i];
+ for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++)
+ {
+ GamepadButton button = (GamepadButton)buttonIdx;
+ if (gamepad->GetButtonDown(button))
+ Input::GamepadButtonDown((InputGamepadIndex)i, button);
+ else if (gamepad->GetButtonUp(button))
+ Input::GamepadButtonUp((InputGamepadIndex)i, button);
+ }
+ }
// Update all actions
for (int32 i = 0; i < Input::ActionMappings.Count(); i++)
diff --git a/Source/Engine/Input/Input.h b/Source/Engine/Input/Input.h
index 8cc1b2106..73e87f5f0 100644
--- a/Source/Engine/Input/Input.h
+++ b/Source/Engine/Input/Input.h
@@ -113,6 +113,16 @@ public:
///
API_EVENT() static Action MouseLeave;
+ ///
+ /// Event fired when gamepad button goes down.
+ ///
+ API_EVENT() static Delegate GamepadButtonDown;
+
+ ///
+ /// Event fired when gamepad button goes up.
+ ///
+ API_EVENT() static Delegate GamepadButtonUp;
+
///
/// Event fired when touch action begins.
///
diff --git a/Source/Engine/Particles/ParticleEffect.cpp b/Source/Engine/Particles/ParticleEffect.cpp
index 6e94594b0..9592147a7 100644
--- a/Source/Engine/Particles/ParticleEffect.cpp
+++ b/Source/Engine/Particles/ParticleEffect.cpp
@@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const
void ParticleEffect::Draw(RenderContext& renderContext)
{
- if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas)
+ if (renderContext.View.Pass == DrawPass::GlobalSDF ||
+ renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas ||
+ EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles))
return;
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition));
RenderContextBatch renderContextBatch(renderContext);
@@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext)
void ParticleEffect::Draw(RenderContextBatch& renderContextBatch)
{
+ const RenderView& mainView = renderContextBatch.GetMainContext().View;
+ if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles))
+ return;
Particles::DrawParticles(renderContextBatch, this);
// Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections
- const RenderView& mainView = renderContextBatch.GetMainContext().View;
const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius);
if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds))
return;
diff --git a/Source/Engine/Physics/Colliders/BoxCollider.cpp b/Source/Engine/Physics/Colliders/BoxCollider.cpp
index 1e90cb91f..47e551b37 100644
--- a/Source/Engine/Physics/Colliders/BoxCollider.cpp
+++ b/Source/Engine/Physics/Colliders/BoxCollider.cpp
@@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value)
void BoxCollider::AutoResize(bool globalOrientation = true)
{
Actor* parent = GetParent();
- if (Cast(parent))
+ if (parent == nullptr || Cast(parent))
return;
// Get bounds of all siblings (excluding itself)
const Vector3 parentScale = parent->GetScale();
if (parentScale.IsAnyZero())
- return; // Avoid division by zero
+ return;
- // Hacky way to get unrotated bounded box of parent.
+ // Hacky way to get unrotated bounded box of parent
const Quaternion parentOrientation = parent->GetOrientation();
parent->SetOrientation(Quaternion::Identity);
BoundingBox parentBox = parent->GetBox();
diff --git a/Source/Engine/Platform/Windows/WindowsPlatform.cpp b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
index e40ae1ca7..42accf298 100644
--- a/Source/Engine/Platform/Windows/WindowsPlatform.cpp
+++ b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
@@ -543,7 +543,6 @@ void WindowsPlatform::ReleaseMutex()
}
}
-PRAGMA_DISABLE_OPTIMIZATION;
void CheckInstructionSet()
{
#if PLATFORM_ARCH_X86 || PLATFORM_ARCH_X64
diff --git a/Source/Engine/Renderer/ColorGradingPass.cpp b/Source/Engine/Renderer/ColorGradingPass.cpp
index d6e164622..c0b40d3f6 100644
--- a/Source/Engine/Renderer/ColorGradingPass.cpp
+++ b/Source/Engine/Renderer/ColorGradingPass.cpp
@@ -37,8 +37,45 @@ GPU_CB_STRUCT(Data {
Float3 Dummy;
float LutWeight;
+
+ void Init(const PostProcessSettings& settings, GPUTexture*& lut)
+ {
+ Dummy = Float2::Zero;
+ auto& toneMapping = settings.ToneMapping;
+ auto& colorGrading = settings.ColorGrading;
+ // White Balance
+ WhiteTemp = toneMapping.WhiteTemperature;
+ WhiteTint = toneMapping.WhiteTint;
+ // Shadows
+ ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
+ ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
+ ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
+ ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
+ ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
+ ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
+ // Midtones
+ ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
+ ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
+ ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
+ ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
+ ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
+ // Highlights
+ ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
+ ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
+ ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
+ ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
+ ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
+ ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
+ //
+ Texture* lutTexture = colorGrading.LutTexture.Get();
+ const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
+ LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
+ lut = useLut ? lutTexture->GetTexture() : nullptr;
+ }
});
+Data DefaultData;
+
// Custom render buffer for caching Color Grading LUT.
class ColorGradingCustomBuffer : public RenderBuffers::CustomBuffer
{
@@ -46,7 +83,7 @@ public:
GPUTexture* LUT = nullptr;
Data CachedData;
ToneMappingMode Mode = ToneMappingMode::None;
- Texture* LutTexture = nullptr;
+ GPUTexture* LutTexture = nullptr;
#if COMPILE_WITH_DEV_ENV
uint64 FrameRendered = 0;
#endif
@@ -82,6 +119,9 @@ bool ColorGradingPass::Init()
#if COMPILE_WITH_DEV_ENV
_shader.Get()->OnReloading.Bind(this);
#endif
+ PostProcessSettings defaultSettings;
+ GPUTexture* defaultLut;
+ DefaultData.Init(defaultSettings, defaultLut);
return false;
}
@@ -125,6 +165,18 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
{
PROFILE_CPU();
+ // Prepare the parameters
+ Data data;
+ GPUTexture* lutTexture;
+ auto& toneMapping = renderContext.List->Settings.ToneMapping;
+ data.Init(renderContext.List->Settings, lutTexture);
+
+ // Skip if color grading is unused
+ if (Platform::MemoryCompare(&DefaultData, &data, sizeof(Data)) == 0 &&
+ lutTexture == nullptr &&
+ toneMapping.Mode == ToneMappingMode::None)
+ return nullptr;
+
// Check if can use volume texture (3D) for a LUT (faster on modern platforms, requires geometry shader)
const auto device = GPUDevice::Instance;
bool use3D = GPU_ALLOW_GEOMETRY_SHADERS && Graphics::PostProcessing::ColorGradingVolumeLUT;
@@ -172,41 +224,8 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
RENDER_TARGET_POOL_SET_NAME(colorGradingBuffer.LUT, "ColorGrading.LUT");
}
- // Prepare the parameters
- Data data;
- data.Dummy = Float2::Zero;
- auto& toneMapping = renderContext.List->Settings.ToneMapping;
- auto& colorGrading = renderContext.List->Settings.ColorGrading;
- // White Balance
- data.WhiteTemp = toneMapping.WhiteTemperature;
- data.WhiteTint = toneMapping.WhiteTint;
- // Shadows
- data.ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
- data.ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
- data.ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
- data.ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
- data.ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
- data.ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
- // Midtones
- data.ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
- data.ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
- data.ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
- data.ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
- data.ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
- // Highlights
- data.ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
- data.ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
- data.ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
- data.ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
- data.ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
- data.ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
- //
- Texture* lutTexture = colorGrading.LutTexture.Get();
- const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
- data.LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
-
// Check if LUT parameter hasn't been changed since the last time
- if (Platform::MemoryCompare(&colorGradingBuffer.CachedData , &data, sizeof(Data)) == 0 &&
+ if (Platform::MemoryCompare(&colorGradingBuffer.CachedData, &data, sizeof(Data)) == 0 &&
colorGradingBuffer.Mode == toneMapping.Mode &&
#if COMPILE_WITH_DEV_ENV
colorGradingBuffer.FrameRendered > _reloadedFrame &&
@@ -232,7 +251,7 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
context->BindCB(0, cb);
context->SetViewportAndScissors((float)lutDesc.Width, (float)lutDesc.Height);
context->SetState(_psLut.Get((int32)toneMapping.Mode));
- context->BindSR(0, useLut ? lutTexture->GetTexture() : nullptr);
+ context->BindSR(0, lutTexture);
#if GPU_ALLOW_GEOMETRY_SHADERS
if (use3D)
{
diff --git a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp
index a954cf31f..25550ecd8 100644
--- a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp
+++ b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp
@@ -11,6 +11,7 @@
#include "Engine/Core/Math/Quaternion.h"
#include "Engine/Core/Config/GraphicsSettings.h"
#include "Engine/Engine/Engine.h"
+#include "Engine/Engine/Units.h"
#include "Engine/Content/Content.h"
#include "Engine/Debug/DebugDraw.h"
#include "Engine/Graphics/GPUContext.h"
@@ -41,6 +42,7 @@
#define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
#define DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE 8
#define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
+#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
#define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
#define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging
@@ -68,11 +70,14 @@ GPU_CB_STRUCT(Data0 {
Int4 ProbeScrollClears[4];
Float3 ViewDir;
float Padding1;
+ Float3 QuantizationError;
+ int32 FrameIndexMod8;
});
GPU_CB_STRUCT(Data1 {
// TODO: use push constants on Vulkan or root signature data on DX12 to reduce overhead of changing single DWORD
- Float2 Padding2;
+ float Padding2;
+ int32 StepSize;
uint32 CascadeIndex;
uint32 ProbeIndexOffset;
});
@@ -214,6 +219,7 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
return true;
_csClassify = shader->GetCS("CS_Classify");
_csUpdateProbesInitArgs = shader->GetCS("CS_UpdateProbesInitArgs");
+ _csUpdateInactiveProbes = shader->GetCS("CS_UpdateInactiveProbes");
_csTraceRays[0] = shader->GetCS("CS_TraceRays", 0);
_csTraceRays[1] = shader->GetCS("CS_TraceRays", 1);
_csTraceRays[2] = shader->GetCS("CS_TraceRays", 2);
@@ -245,6 +251,7 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
LastFrameShaderReload = Engine::FrameCount;
_csClassify = nullptr;
_csUpdateProbesInitArgs = nullptr;
+ _csUpdateInactiveProbes = nullptr;
_csTraceRays[0] = nullptr;
_csTraceRays[1] = nullptr;
_csTraceRays[2] = nullptr;
@@ -322,7 +329,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
const float indirectLightingIntensity = settings.Intensity;
const float probeHistoryWeight = Math::Clamp(settings.TemporalResponse, 0.0f, 0.98f);
const float distance = settings.Distance;
- const Color fallbackIrradiance = settings.FallbackIrradiance;
// Automatically calculate amount of cascades to cover the GI distance at the current probes spacing
const int32 idealProbesCount = 20; // Ideal amount of probes per-cascade to try to fit in order to cover whole distance
@@ -335,7 +341,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
}
// Calculate the probes count based on the amount of cascades and the distance to cover
- const float cascadesDistanceScales[] = { 1.0f, 3.0f, 6.0f, 10.0f }; // Scales each cascade further away from the camera origin
+ const float cascadesDistanceScales[] = { 1.0f, 3.0f, 5.0f, 10.0f }; // Scales each cascade further away from the camera origin
const float distanceExtent = distance / cascadesDistanceScales[cascadesCount - 1];
const float verticalRangeScale = 0.8f; // Scales the probes volume size at Y axis (horizontal aspect ratio makes the DDGI use less probes vertically to cover whole screen)
Int3 probesCounts(Float3::Ceil(Float3(distanceExtent, distanceExtent * verticalRangeScale, distanceExtent) / probesSpacing));
@@ -351,6 +357,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
// Initialize cascades
float probesSpacings[4];
Float3 viewOrigins[4];
+ Float3 blendOrigins[4];
for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++)
{
// Each cascade has higher spacing between probes
@@ -361,14 +368,15 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
// Calculate view origin for cascade by shifting it towards the view direction to account for better view frustum coverage
Float3 viewOrigin = renderContext.View.Position;
Float3 viewDirection = renderContext.View.Direction;
- const Float3 probesDistance = Float3(probesCounts) * cascadeProbesSpacing;
+ const Float3 probesDistance = Float3(probesCounts - 1) * cascadeProbesSpacing;
const float probesDistanceMax = probesDistance.MaxValue();
const Float3 viewRayHit = CollisionsHelper::LineHitsBox(viewOrigin, viewOrigin + viewDirection * (probesDistanceMax * 2.0f), viewOrigin - probesDistance, viewOrigin + probesDistance);
const float viewOriginOffset = viewRayHit.Y * probesDistanceMax * 0.6f;
viewOrigin += viewDirection * viewOriginOffset;
+ //viewOrigin = Float3::Zero;
+ blendOrigins[cascadeIndex] = viewOrigin;
const float viewOriginSnapping = cascadeProbesSpacing;
viewOrigin = Float3::Floor(viewOrigin / viewOriginSnapping) * viewOriginSnapping;
- //viewOrigin = Float3::Zero;
viewOrigins[cascadeIndex] = viewOrigin;
}
@@ -500,6 +508,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
{
auto& cascade = ddgiData.Cascades[cascadeIndex];
ddgiData.Result.Constants.ProbesOriginAndSpacing[cascadeIndex] = Float4(cascade.ProbesOrigin, cascade.ProbesSpacing);
+ ddgiData.Result.Constants.BlendOrigin[cascadeIndex] = Float4(blendOrigins[cascadeIndex], 0.0f);
ddgiData.Result.Constants.ProbesScrollOffsets[cascadeIndex] = Int4(cascade.ProbeScrollOffsets, 0);
}
ddgiData.Result.Constants.RayMaxDistance = distance;
@@ -508,7 +517,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
ddgiData.Result.Constants.ProbeHistoryWeight = probeHistoryWeight;
ddgiData.Result.Constants.IrradianceGamma = 1.5f;
ddgiData.Result.Constants.IndirectLightingIntensity = indirectLightingIntensity;
- ddgiData.Result.Constants.FallbackIrradiance = fallbackIrradiance.ToFloat3() * fallbackIrradiance.A;
+ ddgiData.Result.Constants.FallbackIrradiance = settings.FallbackIrradiance.ToFloat4();
ddgiData.Result.ProbesData = ddgiData.ProbesData->View();
ddgiData.Result.ProbesDistance = ddgiData.ProbesDistance->View();
ddgiData.Result.ProbesIrradiance = ddgiData.ProbesIrradiance->View();
@@ -535,6 +544,8 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
data.TemporalTime = renderContext.List->Setup.UseTemporalAAJitter ? RenderTools::ComputeTemporalTime() : 0.0f;
data.ViewDir = renderContext.View.Direction;
data.SkyboxIntensity = renderContext.List->Sky ? renderContext.List->Sky->GetIndirectLightingIntensity() : 1.0f;
+ data.QuantizationError = RenderTools::GetColorQuantizationError(ddgiData.ProbesIrradiance->Format());
+ data.FrameIndexMod8 = (int32)(Engine::FrameCount % 8);
GBufferPass::SetInputs(renderContext.View, data.GBuffer);
context->UpdateCB(_cb0, &data);
context->BindCB(0, _cb0);
@@ -581,6 +592,23 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
context->ResetUA();
}
+ // For inactive probes, search nearby ones to find the closest valid for quick fallback when sampling irradiance
+ {
+ PROFILE_GPU_CPU_NAMED("Update Inactive Probes");
+ // TODO: this could run within GPUComputePass during Trace Rays or Update Probes to overlap compute works
+ context->BindUA(0, ddgiData.Result.ProbesData);
+ Data1 data;
+ data.CascadeIndex = cascadeIndex;
+ int32 iterations = Math::CeilToInt(Math::Log2((float)Math::Min(probesCounts.MaxValue(), DDGI_PROBE_EMPTY_AREA_DENSITY) + 1.0f));
+ for (int32 i = iterations - 1; i >= 0; i--)
+ {
+ data.StepSize = Math::FloorToInt(Math::Pow(2, (float)i) + 0.5f); // Jump Flood step size
+ context->UpdateCB(_cb1, &data);
+ context->Dispatch(_csUpdateInactiveProbes, threadGroupsX, 1, 1);
+ }
+ context->ResetUA();
+ }
+
// Update probes in batches so ProbesTrace texture can be smaller
uint32 arg = 0;
// TODO: use rays allocator to dispatch raytracing in packets (eg. 8 threads in a group instead of hardcoded limit)
diff --git a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h
index e6ace0373..5953da887 100644
--- a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h
+++ b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h
@@ -15,7 +15,8 @@ public:
// Constant buffer data for DDGI access on a GPU.
GPU_CB_STRUCT(ConstantsData {
Float4 ProbesOriginAndSpacing[4];
- Int4 ProbesScrollOffsets[4];
+ Float4 BlendOrigin[4]; // w is unused
+ Int4 ProbesScrollOffsets[4]; // w is unused
uint32 ProbesCounts[3];
uint32 CascadesCount;
float IrradianceGamma;
@@ -24,8 +25,7 @@ public:
float IndirectLightingIntensity;
Float3 ViewPos;
uint32 RaysCount;
- Float3 FallbackIrradiance;
- float Padding0;
+ Float4 FallbackIrradiance;
});
// Binding data for the GPU.
@@ -44,6 +44,7 @@ private:
GPUConstantBuffer* _cb1 = nullptr;
GPUShaderProgramCS* _csClassify;
GPUShaderProgramCS* _csUpdateProbesInitArgs;
+ GPUShaderProgramCS* _csUpdateInactiveProbes;
GPUShaderProgramCS* _csTraceRays[4];
GPUShaderProgramCS* _csUpdateProbesIrradiance;
GPUShaderProgramCS* _csUpdateProbesDistance;
diff --git a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
index ce0ec1881..7216a8fa8 100644
--- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
+++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
@@ -428,6 +428,7 @@ public:
// Write to objects buffer (this must match unpacking logic in HLSL)
uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4);
ObjectsListBuffer.Write(objectAddress);
+ ObjectsBuffer.Data.EnsureCapacity(ObjectsBuffer.Data.Count() + sizeof(Float4) * (GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE + 6 * GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE));
auto* objectData = ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE);
objectData[0] = Float4(object.Position, object.Radius);
objectData[1] = Float4::Zero;
@@ -511,6 +512,7 @@ public:
{
// Dirty object to redraw
object->LastFrameUpdated = 0;
+ return;
}
GlobalSurfaceAtlasLight* light = Lights.TryGet(a->GetID());
if (light)
diff --git a/Source/Engine/Renderer/PostProcessingPass.cpp b/Source/Engine/Renderer/PostProcessingPass.cpp
index 030541e4c..5ac204523 100644
--- a/Source/Engine/Renderer/PostProcessingPass.cpp
+++ b/Source/Engine/Renderer/PostProcessingPass.cpp
@@ -269,7 +269,7 @@ void PostProcessingPass::Render(RenderContext& renderContext, GPUTexture* input,
int32 bloomMipCount = CalculateBloomMipCount(w1, h1);
// Ensure to have valid data and if at least one effect should be applied
- if (!(useBloom || useToneMapping || useCameraArtifacts) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
+ if (!(useBloom || useToneMapping || useCameraArtifacts || colorGradingLUT) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
{
// Resources are missing. Do not perform rendering. Just copy raw frame
context->SetViewportAndScissors((float)output->Width(), (float)output->Height());
diff --git a/Source/Engine/Renderer/Renderer.cpp b/Source/Engine/Renderer/Renderer.cpp
index 7a72cd923..fd7d43c8b 100644
--- a/Source/Engine/Renderer/Renderer.cpp
+++ b/Source/Engine/Renderer/Renderer.cpp
@@ -402,6 +402,8 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont
case ViewMode::MaterialComplexity:
case ViewMode::Wireframe:
case ViewMode::NoPostFx:
+ case ViewMode::VertexColors:
+ case ViewMode::QuadOverdraw:
setup.UseTemporalAAJitter = false;
break;
}
diff --git a/Source/Engine/Scripting/Scripting.cs b/Source/Engine/Scripting/Scripting.cs
index 7f9f2980c..229e411f3 100644
--- a/Source/Engine/Scripting/Scripting.cs
+++ b/Source/Engine/Scripting/Scripting.cs
@@ -137,8 +137,8 @@ namespace FlaxEngine
{
Debug.LogError($"Unhandled Exception: {exception.Message}");
Debug.LogException(exception);
- if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
- Platform.Fatal($"Unhandled Exception: {exception}");
+ //if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
+ // Platform.Fatal($"Unhandled Exception: {exception}");
}
}
diff --git a/Source/Engine/UI/GUI/Panels/DropPanel.cs b/Source/Engine/UI/GUI/Panels/DropPanel.cs
index de80f9fc5..308272218 100644
--- a/Source/Engine/UI/GUI/Panels/DropPanel.cs
+++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs
@@ -11,6 +11,11 @@ namespace FlaxEngine.GUI
[ActorToolbox("GUI")]
public class DropPanel : ContainerControl
{
+ ///
+ /// Size of the drop down icon.
+ ///
+ public const float DropDownIconSize = 14.0f;
+
///
/// The header height.
///
@@ -368,7 +373,7 @@ namespace FlaxEngine.GUI
var style = Style.Current;
var enabled = EnabledInHierarchy;
- // Paint Background
+ // Draw Background
var backgroundColor = BackgroundColor;
if (backgroundColor.A > 0.0f)
{
@@ -386,7 +391,7 @@ namespace FlaxEngine.GUI
float textLeft = 0;
if (EnableDropDownIcon)
{
- textLeft += 14;
+ textLeft += DropDownIconSize;
var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12);
var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey;
if (_isClosed)
@@ -395,7 +400,7 @@ namespace FlaxEngine.GUI
ArrowImageOpened?.Draw(dropDownRect, arrowColor);
}
- // Text
+ // Header text
var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight);
_headerTextMargin.ShrinkRectangle(ref textRect);
var textColor = HeaderTextColor;
@@ -404,7 +409,9 @@ namespace FlaxEngine.GUI
textColor *= 0.6f;
}
+ Render2D.PushClip(textRect);
Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center);
+ Render2D.PopClip();
if (!_isClosed && EnableContainmentLines)
{
diff --git a/Source/Shaders/GI/DDGI.hlsl b/Source/Shaders/GI/DDGI.hlsl
index c116b597a..b88b846a6 100644
--- a/Source/Shaders/GI/DDGI.hlsl
+++ b/Source/Shaders/GI/DDGI.hlsl
@@ -20,17 +20,23 @@
#define DDGI_PROBE_ATTENTION_MAX 0.98f // Maximum probe attention value that still makes it active (but not activated which is 1.0f).
#define DDGI_PROBE_RESOLUTION_IRRADIANCE 6 // Resolution (in texels) for probe irradiance data (excluding 1px padding on each side)
#define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
-#define DDGI_CASCADE_BLEND_SIZE 2.5f // Distance in probes over which cascades blending happens
+#define DDGI_CASCADE_BLEND_SIZE 2.0f // Distance in probes over which cascades blending happens
#ifndef DDGI_CASCADE_BLEND_SMOOTH
#define DDGI_CASCADE_BLEND_SMOOTH 0 // Enables smooth cascade blending, otherwise dithering will be used
#endif
#define DDGI_SRGB_BLENDING 1 // Enables blending in sRGB color space, otherwise irradiance blending is done in linear space
+#define DDGI_DEFAULT_BIAS 0.2f // Default value for DDGI sampling bias
+#define DDGI_FALLBACK_COORDS_ENCODE(coord) ((float3)(coord + 1) / 128.0f)
+#define DDGI_FALLBACK_COORDS_DECODE(data) (uint3)(data.xyz * 128.0f - 1)
+#define DDGI_FALLBACK_COORDS_VALID(data) (length(data.xyz) > 0)
+//#define DDGI_DEBUG_CASCADE 0 // Forces a specific cascade to be only in use (for debugging)
// DDGI data for a constant buffer
struct DDGIData
{
float4 ProbesOriginAndSpacing[4];
- int4 ProbesScrollOffsets[4]; // w unused
+ float4 BlendOrigin[4]; // w is unused
+ int4 ProbesScrollOffsets[4]; // w is unused
uint3 ProbesCounts;
uint CascadesCount;
float IrradianceGamma;
@@ -39,8 +45,7 @@ struct DDGIData
float IndirectLightingIntensity;
float3 ViewPos;
uint RaysCount;
- float3 FallbackIrradiance;
- float Padding0;
+ float4 FallbackIrradiance;
};
uint GetDDGIProbeIndex(DDGIData data, uint3 probeCoords)
@@ -159,6 +164,8 @@ float2 GetDDGIProbeUV(DDGIData data, uint cascadeIndex, uint probeIndex, float2
float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probesData, Texture2D probesDistance, Texture2D probesIrradiance, float3 worldPosition, float3 worldNormal, uint cascadeIndex, float3 probesOrigin, float3 probesExtent, float probesSpacing, float3 biasedWorldPosition)
{
+ bool invalidCascade = cascadeIndex >= data.CascadesCount;
+ cascadeIndex = min(cascadeIndex, data.CascadesCount - 1);
uint3 probeCoordsEnd = data.ProbesCounts - uint3(1, 1, 1);
uint3 baseProbeCoords = clamp(uint3((worldPosition - probesOrigin + probesExtent) / probesSpacing), uint3(0, 0, 0), probeCoordsEnd);
@@ -168,7 +175,6 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
// Loop over the closest probes to accumulate their contributions
float4 irradiance = float4(0, 0, 0, 0);
- const int3 SearchAxisMasks[3] = { int3(1, 0, 0), int3(0, 1, 0), int3(0, 0, 1) };
for (uint i = 0; i < 8; i++)
{
uint3 probeCoordsOffset = uint3(i, i >> 1, i >> 2) & 1;
@@ -178,33 +184,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
// Load probe position and state
float4 probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
uint probeState = DecodeDDGIProbeState(probeData);
+ uint useVisibility = true;
+ float minWight = 0.000001f;
if (probeState == DDGI_PROBE_STATE_INACTIVE)
{
- // Search nearby probes to find any nearby GI sample
- for (int searchDistance = 1; searchDistance < 3 && probeState == DDGI_PROBE_STATE_INACTIVE; searchDistance++)
- for (uint searchAxis = 0; searchAxis < 3; searchAxis++)
- {
- int searchAxisDir = probeCoordsOffset[searchAxis] ? 1 : -1;
- int3 searchCoordsOffset = SearchAxisMasks[searchAxis] * searchAxisDir * searchDistance;
- uint3 searchCoords = clamp((int3)probeCoords + searchCoordsOffset, int3(0, 0, 0), (int3)probeCoordsEnd);
- uint searchIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, searchCoords);
- float4 searchData = LoadDDGIProbeData(data, probesData, cascadeIndex, searchIndex);
- uint searchState = DecodeDDGIProbeState(searchData);
- if (searchState != DDGI_PROBE_STATE_INACTIVE)
- {
- // Use nearby probe as a fallback (visibility test might ignore it but with smooth gradient)
- probeCoords = searchCoords;
- probeIndex = searchIndex;
- probeData = searchData;
- probeState = searchState;
- break;
- }
- }
- if (probeState == DDGI_PROBE_STATE_INACTIVE)
- continue;
+ // Use fallback probe that is closest to this one
+ uint3 fallbackCoords = DDGI_FALLBACK_COORDS_DECODE(probeData);
+ float fallbackToProbeDist = length((float3)probeCoords - (float3)fallbackCoords);
+ useVisibility = fallbackToProbeDist <= 1.0f; // Skip the visibility test, which would block probes that are too far away (max distance is limited to 1.5x of the probe spacing)
+ if (fallbackToProbeDist > 2.0f) minWight = 1.0f;
+ probeCoords = fallbackCoords;
+ probeIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, fallbackCoords);
+ probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
+ //if (DecodeDDGIProbeState(probeData) == DDGI_PROBE_STATE_INACTIVE) continue;
}
- float3 probeBasePosition = baseProbeWorldPosition + ((probeCoords - baseProbeCoords) * probesSpacing);
- float3 probePosition = probeBasePosition + probeData.xyz * probesSpacing; // Probe offset is [-1;1] within probes spacing
+
+ // Calculate probe position
+ float3 probePosition = baseProbeWorldPosition + (((float3)probeCoords - (float3)baseProbeCoords) * probesSpacing) + probeData.xyz * probesSpacing;
// Calculate the distance and direction from the (biased and non-biased) shading point and the probe
float3 worldPosToProbe = normalize(probePosition - worldPosition);
@@ -213,6 +209,7 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
// Smooth backface test
float weight = Square(dot(worldPosToProbe, worldNormal) * 0.5f + 0.5f);
+ weight = max(weight, 0.1f);
// Sample distance texture
float2 octahedralCoords = GetOctahedralCoords(-biasedPosToProbe);
@@ -220,24 +217,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
float2 probeDistance = probesDistance.SampleLevel(SamplerLinearClamp, uv, 0).rg * 2.0f;
// Visibility weight (Chebyshev)
- if (biasedPosToProbeDist > probeDistance.x)
+ if (biasedPosToProbeDist > probeDistance.x && useVisibility)
{
float variance = abs(Square(probeDistance.x) - probeDistance.y);
float visibilityWeight = variance / (variance + Square(biasedPosToProbeDist - probeDistance.x));
- weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.05f);
+ weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.0f);
}
// Avoid a weight of zero
- weight = max(weight, 0.000001f);
+ weight = max(weight, minWight);
// Adjust weight curve to inject a small portion of light
const float minWeightThreshold = 0.2f;
- if (weight < minWeightThreshold)
- weight *= Square(weight) / Square(minWeightThreshold);
+ if (weight < minWeightThreshold) weight *= (weight * weight) * (1.0f / (minWeightThreshold * minWeightThreshold));
// Calculate trilinear weights based on the distance to each probe to smoothly transition between grid of 8 probes
float3 trilinear = lerp(1.0f - biasAlpha, biasAlpha, (float3)probeCoordsOffset);
- weight *= max(trilinear.x * trilinear.y * trilinear.z, 0.001f);
+ weight *= saturate(trilinear.x * trilinear.y * trilinear.z * 2.0f);
// Sample irradiance texture
octahedralCoords = GetOctahedralCoords(worldNormal);
@@ -269,7 +265,9 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
if (irradiance.a > 0.0f)
{
// Normalize irradiance
- irradiance.rgb /= irradiance.a;
+ //irradiance.rgb /= irradiance.a;
+ //irradiance.rgb /= lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
+ irradiance.rgb /= invalidCascade ? irradiance.a : lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
#if DDGI_SRGB_BLENDING
irradiance.rgb *= irradiance.rgb;
#endif
@@ -281,22 +279,34 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D probes
float3 GetDDGISurfaceBias(float3 viewDir, float probesSpacing, float3 worldNormal, float bias)
{
// Bias the world-space position to reduce artifacts
- return (worldNormal * 0.2f + viewDir * 0.8f) * (0.75f * probesSpacing * bias);
+ return (worldNormal * 0.2f + viewDir * 0.8f) * (0.6f * probesSpacing * bias);
+}
+
+// [Inigo Quilez, https://iquilezles.org/articles/distfunctions/]
+float sdRoundBox(float3 p, float3 b, float r)
+{
+ float3 q = abs(p) - b + r;
+ return length(max(q, 0.0f)) + min(max(q.x, max(q.y, q.z)), 0.0f) - r;
}
// Samples DDGI probes volume at the given world-space position and returns the irradiance.
// bias - scales the bias vector to the initial sample point to reduce self-shading artifacts
// dither - randomized per-pixel value in range 0-1, used to smooth dithering for cascades blending
-float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, Texture2D probesDistance, Texture2D probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = 0.2f, float dither = 0.0f)
+float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, Texture2D probesDistance, Texture2D probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = DDGI_DEFAULT_BIAS, float dither = 0.0f)
{
// Select the highest cascade that contains the sample location
- uint cascadeIndex = 0;
float probesSpacing = 0, cascadeWeight = 0;
float3 probesOrigin = (float3)0, probesExtent = (float3)0, biasedWorldPosition = (float3)0;
float3 viewDir = normalize(data.ViewPos - worldPosition);
#if DDGI_CASCADE_BLEND_SMOOTH
dither = 0.0f;
#endif
+#ifdef DDGI_DEBUG_CASCADE
+ uint cascadeIndex = DDGI_DEBUG_CASCADE;
+#else
+ uint cascadeIndex = 0;
+ if (data.CascadesCount == 0)
+ return float3(0, 0, 0);
for (; cascadeIndex < data.CascadesCount; cascadeIndex++)
{
// Get cascade data
@@ -306,26 +316,21 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, T
biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);
// Calculate cascade blending weight (use input bias to smooth transition)
- float cascadeBlendSmooth = frac(max(distance(data.ViewPos, worldPosition) - probesExtent.x, 0) / probesSpacing) * 0.1f;
- float3 cascadeBlendPoint = worldPosition - probesOrigin - cascadeBlendSmooth * probesSpacing;
float fadeDistance = probesSpacing * DDGI_CASCADE_BLEND_SIZE;
-#if DDGI_CASCADE_BLEND_SMOOTH
- fadeDistance *= 2.0f; // Make it even smoother when using linear blending
-#endif
- cascadeWeight = saturate(Min3(probesExtent - abs(cascadeBlendPoint)) / fadeDistance);
+ float3 blendPos = worldPosition - data.BlendOrigin[cascadeIndex].xyz;
+ cascadeWeight = sdRoundBox(blendPos, probesExtent - probesSpacing, probesSpacing * 2) + fadeDistance;
+ cascadeWeight = 1 - saturate(cascadeWeight / fadeDistance);
if (cascadeWeight > dither)
break;
}
- if (cascadeIndex == data.CascadesCount)
- return data.FallbackIrradiance;
+#endif
// Sample cascade
float3 result = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);
// Blend with the next cascade (or fallback irradiance outside the volume)
+#if DDGI_CASCADE_BLEND_SMOOTH && !defined(DDGI_DEBUG_CASCADE)
cascadeIndex++;
-#if DDGI_CASCADE_BLEND_SMOOTH
- result *= cascadeWeight;
if (cascadeIndex < data.CascadesCount && cascadeWeight < 0.99f)
{
probesSpacing = data.ProbesOriginAndSpacing[cascadeIndex].w;
@@ -333,18 +338,16 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D probesData, T
probesExtent = (data.ProbesCounts - 1) * (probesSpacing * 0.5f);
biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);
float3 resultNext = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);
+ result *= cascadeWeight;
result += resultNext * (1 - cascadeWeight);
}
- else
- {
- result += data.FallbackIrradiance * (1 - cascadeWeight);
- }
-#else
- if (cascadeIndex == data.CascadesCount)
- {
- result += data.FallbackIrradiance * (1 - cascadeWeight);
- }
#endif
+ if (cascadeIndex >= data.CascadesCount)
+ {
+ // Blend between the last cascade and the fallback irradiance
+ float fallbackWeight = (1 - cascadeWeight) * data.FallbackIrradiance.a;
+ result = lerp(result, data.FallbackIrradiance.rgb, fallbackWeight);
+ }
return result;
}
diff --git a/Source/Shaders/GI/DDGI.shader b/Source/Shaders/GI/DDGI.shader
index daad2018d..b080efc0b 100644
--- a/Source/Shaders/GI/DDGI.shader
+++ b/Source/Shaders/GI/DDGI.shader
@@ -13,6 +13,7 @@
#include "./Flax/Math.hlsl"
#include "./Flax/Noise.hlsl"
#include "./Flax/Quaternion.hlsl"
+#include "./Flax/MonteCarlo.hlsl"
#include "./Flax/GlobalSignDistanceField.hlsl"
#include "./Flax/GI/GlobalSurfaceAtlas.hlsl"
#include "./Flax/GI/DDGI.hlsl"
@@ -26,6 +27,7 @@
#define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
#define DDGI_PROBE_RELOCATE_ITERATIVE 1 // If true, probes relocation algorithm tries to move them in additive way, otherwise all nearby locations are checked to find the best position
#define DDGI_PROBE_RELOCATE_FIND_BEST 1 // If true, probes relocation algorithm tries to move to the best matching location within nearby area
+#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
#define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
#define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging
@@ -42,10 +44,13 @@ float TemporalTime;
int4 ProbeScrollClears[4];
float3 ViewDir;
float Padding1;
+float3 QuantizationError;
+uint FrameIndexMod8;
META_CB_END
META_CB_BEGIN(1, Data1)
-float2 Padding2;
+float Padding2;
+int StepSize;
uint CascadeIndex;
uint ProbeIndexOffset;
META_CB_END
@@ -98,6 +103,11 @@ float3 Remap(float3 value, float3 fromMin, float3 fromMax, float3 toMin, float3
return (value - fromMin) / (fromMax - fromMin) * (toMax - toMin) + toMin;
}
+bool IsProbeAtBorder(uint3 probeCoords)
+{
+ return min(probeCoords.x, min(probeCoords.y, probeCoords.z)) == 0 || probeCoords.x == DDGI.ProbesCounts.x - 1 || probeCoords.y == DDGI.ProbesCounts.y - 1 || probeCoords.z == DDGI.ProbesCounts.z - 1;
+}
+
// Compute shader for updating probes state between active and inactive and performing probes relocation.
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
@@ -112,6 +122,14 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
float3 probeBasePosition = GetDDGIProbeWorldPosition(DDGI, CascadeIndex, probeCoords);
+#ifdef DDGI_DEBUG_CASCADE
+ // Single cascade-only debugging
+ if (CascadeIndex != DDGI_DEBUG_CASCADE)
+ {
+ RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
+ return;
+ }
+#else
// Disable probes that are is in the range of higher-quality cascade
if (CascadeIndex > 0)
{
@@ -119,15 +137,15 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float prevProbesSpacing = DDGI.ProbesOriginAndSpacing[prevCascade].w;
float3 prevProbesOrigin = DDGI.ProbesScrollOffsets[prevCascade].xyz * prevProbesSpacing + DDGI.ProbesOriginAndSpacing[prevCascade].xyz;
float3 prevProbesExtent = (DDGI.ProbesCounts - 1) * (prevProbesSpacing * 0.5f);
- prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE); // Apply safe margin to allow probes on cascade edges
+ prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE) * 2; // Apply safe margin to allow probes on cascade edges
float prevCascadeWeight = Min3(prevProbesExtent - abs(probeBasePosition - prevProbesOrigin));
if (prevCascadeWeight > 0.1f)
{
- // Disable probe
RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
return;
}
}
+#endif
// Check if probe was scrolled
int3 probeScrollClears = ProbeScrollClears[CascadeIndex].xyz;
@@ -171,9 +189,29 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float voxelLimit = GlobalSDF.CascadeVoxelSize[CascadeIndex] * 0.8f;
float distanceLimit = probesSpacing * ProbesDistanceLimits[CascadeIndex];
float relocateLimit = probesSpacing * ProbesRelocateLimits[CascadeIndex];
- if (sdfDst > distanceLimit + length(probeOffset)) // Probe is too far from geometry (or deep inside)
+#ifdef DDGI_PROBE_EMPTY_AREA_DENSITY
+ uint3 probeCoordsStable = GetDDGIProbeCoords(DDGI, probeIndex);
+ if (sdf > probesSpacing * DDGI.ProbesCounts.x * 0.3f
+#if DDGI_PROBE_EMPTY_AREA_DENSITY > 1
+ && (
+ // Low-density probe grid
+ (probeCoordsStable.x % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.y % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.z % DDGI_PROBE_EMPTY_AREA_DENSITY == 0)
+ // Edge probes at the last cascade (for good fallback irradiance outside the GI distance)
+ //|| (CascadeIndex + 1 == DDGI.CascadesCount && IsProbeAtBorder(probeCoords))
+ )
+#endif
+ )
{
- // Disable it
+ // Add some fallback probes in empty areas to provide valid GI for nearby dynamic objects or transparency
+ probeOffset = float3(0, 0, 0);
+ probeState = wasScrolled || probeStateOld == DDGI_PROBE_STATE_INACTIVE ? DDGI_PROBE_STATE_ACTIVATED : DDGI_PROBE_STATE_ACTIVE;
+ probeAttention = DDGI_PROBE_ATTENTION_MIN;
+ }
+ else
+#endif
+ if (sdfDst > distanceLimit + length(probeOffset))
+ {
+ // Probe is too far from geometry (or deep inside) so disable it
probeOffset = float3(0, 0, 0);
probeState = DDGI_PROBE_STATE_INACTIVE;
probeAttention = 0.0f;
@@ -194,6 +232,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
probeAttention = clamp(probeAttention, DDGI_PROBE_ATTENTION_MIN, DDGI_PROBE_ATTENTION_MAX);
// Relocate only if probe location is not good enough
+ BRANCH
if (sdf <= voxelLimit)
{
#if DDGI_PROBE_RELOCATE_ITERATIVE
@@ -265,6 +304,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
bool wasActivated = probeStateOld == DDGI_PROBE_STATE_INACTIVE;
bool wasRelocated = distance(probeOffset, probeOffsetOld) > 2.0f;
#if DDGI_PROBE_RELOCATE_FIND_BEST || DDGI_PROBE_RELOCATE_ITERATIVE
+ BRANCH
if (wasRelocated && !wasActivated)
{
// If probe was relocated but the previous location is visible from the new one, then don't re-activate it for smoother blend
@@ -323,6 +363,78 @@ void CS_UpdateProbesInitArgs()
#endif
+#ifdef _CS_UpdateInactiveProbes
+
+RWTexture2D RWProbesData : register(u0);
+
+void CheckNearbyProbe(inout uint3 fallbackCoords, inout uint probeState, inout float minDistance, uint3 probeCoords, int3 probeCoordsEnd, int3 offset)
+{
+ uint3 nearbyCoords = (uint3)clamp(((int3)probeCoords + offset), int3(0, 0, 0), probeCoordsEnd);
+ uint nearbyIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, nearbyCoords);
+ float4 nearbyData = RWProbesData[GetDDGIProbeTexelCoords(DDGI, CascadeIndex, nearbyIndex)];
+ float nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
+ if (DecodeDDGIProbeState(nearbyData) != DDGI_PROBE_STATE_INACTIVE && nearbyDist < minDistance)
+ {
+ // Use nearby probe
+ fallbackCoords = nearbyCoords;
+ probeState = DDGI_PROBE_STATE_ACTIVE;
+ minDistance = nearbyDist;
+ return;
+ }
+ nearbyCoords = DDGI_FALLBACK_COORDS_DECODE(nearbyData);
+ nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
+ if (DDGI_FALLBACK_COORDS_VALID(nearbyData) && nearbyDist < minDistance)
+ {
+ // Use fallback probe
+ fallbackCoords = nearbyCoords;
+ probeState = DDGI_PROBE_STATE_ACTIVE;
+ minDistance = nearbyDist;
+ }
+}
+
+// Compute shader to store closest valid probe coords inside inactive probes data for quick fallback lookup when sampling irradiance.
+// Uses Jump Flood algorithm.
+META_CS(true, FEATURE_LEVEL_SM5)
+[numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
+void CS_UpdateInactiveProbes(uint3 DispatchThreadId : SV_DispatchThreadID)
+{
+ uint probeIndex = min(DispatchThreadId.x, ProbesCount - 1);
+ uint3 fallbackCoords = uint3(1000, 1000, 1000);
+
+ // Load probe data for the current thread
+ uint3 probeCoords = GetDDGIProbeCoords(DDGI, probeIndex);
+ probeIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, probeCoords);
+ int2 probeDataCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex);
+ float4 probeData = RWProbesData[probeDataCoords];
+ uint probeState = DecodeDDGIProbeState(probeData);
+ BRANCH
+ if (probeState == DDGI_PROBE_STATE_INACTIVE)
+ {
+ // Find the closest active probe (Jump Flood)
+ int3 probeCoordsEnd = (int3)DDGI.ProbesCounts - int3(1, 1, 1);
+ float minDistance = 1e27f;
+ UNROLL for (int z = -1; z <= 1; z++)
+ UNROLL for (int y = -1; y <= 1; y++)
+ UNROLL for (int x = -1; x <= 1; x++)
+ {
+ int3 offset = int3(x, y, z) * StepSize;
+ CheckNearbyProbe(fallbackCoords, probeState, minDistance, probeCoords, probeCoordsEnd, offset);
+ }
+ }
+
+ // Ensure all threads (within dispatch) got proper data before writing back to the same memory
+ AllMemoryBarrierWithGroupSync();
+
+ // Write modified probe data back (remain inactive)
+ BRANCH
+ if (probeState != DDGI_PROBE_STATE_INACTIVE && DispatchThreadId.x < ProbesCount && fallbackCoords.x != 1000)
+ {
+ RWProbesData[probeDataCoords] = EncodeDDGIProbeData(DDGI_FALLBACK_COORDS_ENCODE(fallbackCoords), DDGI_PROBE_STATE_INACTIVE, 0.0f);
+ }
+}
+
+#endif
+
#ifdef _CS_TraceRays
RWTexture2D RWProbesTrace : register(u0);
@@ -392,6 +504,8 @@ void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)
// Add some bias to prevent self occlusion artifacts in Chebyshev due to Global SDF being very incorrect in small scale
radiance.w = max(radiance.w + GlobalSDF.CascadeVoxelSize[hit.HitCascade] * 0.5f, 0);
+ float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
+ radiance.w += probesSpacing * 0.05f;
}
}
else
@@ -639,7 +753,7 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
// Add distance (R), distance^2 (G) and weight (A)
float rayDistance = CachedProbesTraceDistance[rayIndex];
- result += float4(rayDistance * rayWeight, (rayDistance * rayDistance) * rayWeight, 0.0f, rayWeight);
+ result += float4(rayDistance, rayDistance * rayDistance, 0.0f, 1.0f) * rayWeight;
#endif
}
@@ -700,13 +814,17 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
//result.rgb = previous + (irradianceDelta * 0.25f);
}
result = float4(lerp(result.rgb, previous.rgb, historyWeight), 1.0f);
+
+ // Apply quantization error to reduce yellowish artifacts due to R11G11B10 format
+ float noise = InterleavedGradientNoise(octahedralCoords, FrameIndexMod8);
+ result.rgb = QuantizeColor(result.rgb, noise, QuantizationError);
#else
result = float4(lerp(result.rg, previous.rg, historyWeight), 0.0f, 1.0f);
#endif
RWOutput[outputCoords] = result;
-
GroupMemoryBarrierWithGroupSync();
+
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
#if DDGI_PROBE_UPDATE_MODE == 0
@@ -786,10 +904,10 @@ void PS_IndirectLighting(Quad_VS2PS input, out float4 output : SV_Target0)
}
// Sample irradiance
- float bias = 0.2f;
float dither = RandN2(input.TexCoord + TemporalTime).x;
- float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, gBuffer.WorldPos, gBuffer.Normal, bias, dither);
-
+ float3 samplePos = gBuffer.WorldPos + gBuffer.Normal * (dither * 0.1f + 0.1f);
+ float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, samplePos, gBuffer.Normal, DDGI_DEFAULT_BIAS, dither);
+
// Calculate lighting
float3 diffuseColor = GetDiffuseColor(gBuffer);
float3 diffuse = Diffuse_Lambert(diffuseColor);
diff --git a/Source/Shaders/GI/GlobalSurfaceAtlas.shader b/Source/Shaders/GI/GlobalSurfaceAtlas.shader
index 6778a7cd7..6930107d1 100644
--- a/Source/Shaders/GI/GlobalSurfaceAtlas.shader
+++ b/Source/Shaders/GI/GlobalSurfaceAtlas.shader
@@ -328,7 +328,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
viewRay = normalize(viewRay - ViewWorldPos);
trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
- trace.NeedsHitNormal = true;
GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);
float3 color;
@@ -337,7 +336,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
// Sample Global Surface Atlas at the hit location
float surfaceThreshold = GetGlobalSurfaceAtlasThreshold(GlobalSDF, hit);
color = SampleGlobalSurfaceAtlas(GlobalSurfaceAtlas, GlobalSurfaceAtlasChunks, GlobalSurfaceAtlasCulledObjects, GlobalSurfaceAtlasObjects, GlobalSurfaceAtlasDepth, GlobalSurfaceAtlasTex, hit.GetHitPosition(trace), -viewRay, surfaceThreshold).rgb;
- //color = hit.HitNormal * 0.5f + 0.5f;
}
else
{
diff --git a/Source/Shaders/GlobalSignDistanceField.hlsl b/Source/Shaders/GlobalSignDistanceField.hlsl
index 8075c081d..c1bd4250b 100644
--- a/Source/Shaders/GlobalSignDistanceField.hlsl
+++ b/Source/Shaders/GlobalSignDistanceField.hlsl
@@ -32,17 +32,13 @@ struct GlobalSDFTrace
float MinDistance;
float3 WorldDirection;
float MaxDistance;
- float StepScale;
- bool NeedsHitNormal;
- void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance, float stepScale = 1.0f)
+ void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance)
{
WorldPosition = worldPosition;
WorldDirection = worldDirection;
MinDistance = minDistance;
MaxDistance = maxDistance;
- StepScale = stepScale;
- NeedsHitNormal = false;
}
};
@@ -75,12 +71,23 @@ void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldP
textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
}
-// Clamps Global SDF cascade UV to ensure it can be sued for gradient sampling (clamps first and last pixels).
+void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldPosition, out float3 cascadeUV, out float3 textureUV, out float3 textureMipUV)
+{
+ float4 cascadePosDistance = data.CascadePosDistance[cascade];
+ float3 posInCascade = worldPosition - cascadePosDistance.xyz;
+ float cascadeSize = cascadePosDistance.w * 2;
+ cascadeUV = saturate(posInCascade / cascadeSize + 0.5f);
+ textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
+ float halfTexelOffsetMip = (GLOBAL_SDF_RASTERIZE_MIP_FACTOR * 0.5f) / data.Resolution;
+ textureMipUV = textureUV + float3(halfTexelOffsetMip / (float)data.CascadesCount, halfTexelOffsetMip, halfTexelOffsetMip); // Mipmaps are offset by half texel to sample correctly
+}
+
+// Clamps Global SDF cascade UV to ensure it can be used for gradient sampling (clamps first and last pixels).
void ClampGlobalSDFTextureGradientUV(const GlobalSDFData data, uint cascade, float texelOffset, inout float3 textureUV)
{
float cascadeSizeUV = 1.0f / data.CascadesCount;
- float cascadeUVStart = cascadeSizeUV * cascade + texelOffset;
- float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 3;
+ float cascadeUVStart = cascadeSizeUV * cascade + texelOffset * 2;
+ float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 4;
textureUV.x = clamp(textureUV.x, cascadeUVStart, cascadeUVEnd);
}
@@ -144,13 +151,13 @@ float SampleGlobalSDF(const GlobalSDFData data, Texture3D tex, Text
startCascade = min(startCascade, data.CascadesCount - 1);
for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
{
- float3 cascadeUV, textureUV;
- GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
+ float3 cascadeUV, textureUV, textureMipUV;
+ GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
float voxelSize = data.CascadeVoxelSize[cascade];
float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
- float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0);
+ float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0);
if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
{
distance = distanceMip * maxDistanceMip;
@@ -208,13 +215,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D
startCascade = min(startCascade, data.CascadesCount - 1);
for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
{
- float3 cascadeUV, textureUV;
- GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
+ float3 cascadeUV, textureUV, textureMipUV;
+ GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
float voxelSize = data.CascadeVoxelSize[cascade];
float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
- float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
+ float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip;
if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
{
float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
@@ -236,13 +243,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D
{
distance = distanceMip;
float texelOffset = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR / data.Resolution;
- ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
- float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
- float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
- float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
- float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
- float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
- float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
+ ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureMipUV);
+ float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x + texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
+ float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x - texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
+ float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y + texelOffset, textureMipUV.z), 0).x;
+ float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y - texelOffset, textureMipUV.z), 0).x;
+ float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z + texelOffset), 0).x;
+ float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z - texelOffset), 0).x;
gradient = float3(xp - xn, yp - yn, zp - zn) * maxDistanceMip;
}
break;
@@ -290,59 +297,32 @@ GlobalSDFHit RayTraceGlobalSDF(const GlobalSDFData data, Texture3D
float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
LOOP
- for (; step < 250 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
+ for (; step < 100 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
{
float3 stepPosition = trace.WorldPosition + trace.WorldDirection * stepTime;
- float stepScale = trace.StepScale;
// Sample SDF
- float stepDistance, voxelSizeScale = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR;
- float3 cascadeUV, textureUV;
- GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV);
- float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
- if (distanceMip < chunkSize)
- {
- stepDistance = distanceMip;
- float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
- if (distanceTex < chunkMargin)
- {
- stepDistance = distanceTex;
- voxelSizeScale = 1.0f;
- stepScale *= 0.63f; // Perform smaller steps nearby geometry
- }
- }
- else
- {
- // Assume no SDF nearby so perform a jump to the next chunk
- stepDistance = chunkSize;
- voxelSizeScale = 1.0f;
- }
+ float stepDistance;
+ float3 cascadeUV, textureUV, textureMipUV;
+ GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV, textureMipUV);
+ stepDistance = min(mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip, chunkSize);
+ float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
+ FLATTEN
+ if (distanceTex < chunkMargin)
+ stepDistance = distanceTex;
// Detect surface hit
- float minSurfaceThickness = voxelSizeScale * voxelExtent * saturate(stepTime / voxelSize);
+ float minSurfaceThickness = voxelExtent * saturate(stepTime / voxelSize);
if (stepDistance < minSurfaceThickness)
{
// Surface hit
hit.HitTime = max(stepTime + stepDistance - minSurfaceThickness, 0.0f);
hit.HitCascade = cascade;
hit.HitSDF = stepDistance;
- if (trace.NeedsHitNormal)
- {
- // Calculate hit normal from SDF gradient
- float texelOffset = 1.0f / data.Resolution;
- ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
- float xp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
- float xn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
- float yp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
- float yn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
- float zp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
- float zn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
- hit.HitNormal = normalize(float3(xp - xn, yp - yn, zp - zn));
- }
}
// Move forward
- stepTime += max(stepDistance * stepScale, voxelSize);
+ stepTime += max(stepDistance, voxelSize);
}
hit.StepsCount += step;
}
diff --git a/Source/Shaders/GlobalSignDistanceField.shader b/Source/Shaders/GlobalSignDistanceField.shader
index 461dba08d..fe4bafda5 100644
--- a/Source/Shaders/GlobalSignDistanceField.shader
+++ b/Source/Shaders/GlobalSignDistanceField.shader
@@ -311,26 +311,39 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
viewRay = normalize(viewRay - ViewWorldPos);
trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
- trace.NeedsHitNormal = true;
GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);
// Debug draw
- float3 color = saturate(hit.StepsCount / 80.0f).xxx;
- if (!hit.IsHit())
- color.rg *= 0.4f;
-#if 0
- else
- {
+ float3 color = saturate(hit.StepsCount / 50.0f).xxx;
+ if (hit.IsHit())
+ {
+#if 1
+ float3 hitPosition = hit.GetHitPosition(trace);
+ float hitSDF;
+ float3 hitNormal = SampleGlobalSDFGradient(GlobalSDF, GlobalSDFTex, GlobalSDFMip, hitPosition, hitSDF, hit.HitCascade);
+#if 1
+ // Composite step count with SDF normals
+ //color.rgb *= saturate(normalize(hitNormal) * 0.5f + 0.7f) + 0.3f;
+ color = lerp(normalize(hitNormal) * 0.5f + 0.5f, 1 - color, saturate(hit.StepsCount / 80.0f));
+#else
// Debug draw SDF normals
- color.rgb = normalize(hit.HitNormal) * 0.5f + 0.5f;
- }
-#elif 1
+ color = normalize(hitNormal) * 0.5f + 0.5f;
+#endif
+#else
+ // Heatmap with step count
+ if (hit.StepsCount > 40)
+ color = float3(saturate(hit.StepsCount / 80.0f), 0, 0);
+ else if (hit.StepsCount > 20)
+ color = float3(saturate(hit.StepsCount / 40.0f).xx, 0);
+ else
+ color = float3(0, saturate(hit.StepsCount / 20.0f), 0);
+#endif
+ }
else
{
- // Composite with SDF normals
- color.rgb *= saturate(normalize(hit.HitNormal) * 0.5f + 0.7f) + 0.1f;
+ // Bluish sky
+ color.rg *= 0.4f;
}
-#endif
return float4(color, 1);
}
diff --git a/Source/Shaders/Noise.hlsl b/Source/Shaders/Noise.hlsl
index dc35f1efc..df5a041fa 100644
--- a/Source/Shaders/Noise.hlsl
+++ b/Source/Shaders/Noise.hlsl
@@ -54,6 +54,26 @@ float2 PerlinNoiseFade(float2 t)
return t * t * t * (t * (t * 6.0 - 15.0) + 10.0);
}
+// "Next Generation Post Processing in Call of Duty: Advanced Warfare"
+// http://advances.realtimerendering.com/s2014/index.html
+float InterleavedGradientNoise(float2 uv, uint frameCount)
+{
+ const float2 magicFrameScale = float2(47, 17) * 0.695;
+ uv += frameCount * magicFrameScale;
+ const float3 magic = float3(0.06711056, 0.00583715, 52.9829189);
+ return frac(magic.z * frac(dot(uv, magic.xy)));
+}
+
+// Removes error from the color to properly store it in lower precision formats (error = 2^(-mantissaBits))
+float3 QuantizeColor(float3 color, float noise, float3 error)
+{
+ float3 delta = color * error;
+ delta.x = asfloat(asuint(delta.x) & ~0x007fffff);
+ delta.y = asfloat(asuint(delta.y) & ~0x007fffff);
+ delta.z = asfloat(asuint(delta.z) & ~0x007fffff);
+ return color + delta * noise;
+}
+
float rand2dTo1d(float2 value, float2 dotDir = float2(12.9898, 78.233))
{
// https://www.ronja-tutorials.com/post/024-white-noise/
diff --git a/Source/ThirdParty/meshoptimizer/allocator.cpp b/Source/ThirdParty/meshoptimizer/allocator.cpp
index 12eda3872..6b6083da2 100644
--- a/Source/ThirdParty/meshoptimizer/allocator.cpp
+++ b/Source/ThirdParty/meshoptimizer/allocator.cpp
@@ -1,8 +1,17 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
-void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+meshopt_Allocator::Storage& meshopt_Allocator::storage()
{
- meshopt_Allocator::Storage::allocate = allocate;
- meshopt_Allocator::Storage::deallocate = deallocate;
+ static Storage s = {::operator new, ::operator delete };
+ return s;
+}
+#endif
+
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+{
+ meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
+ s.allocate = allocate;
+ s.deallocate = deallocate;
}
diff --git a/Source/ThirdParty/meshoptimizer/clusterizer.cpp b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
index 52fe5a362..73cc0ab53 100644
--- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
@@ -6,19 +6,39 @@
#include
#include
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
+#define SIMD_SSE
+#include
+#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
+#define SIMD_NEON
+#include
+#endif
+#endif // !MESHOPTIMIZER_NO_SIMD
+
// This work is based on:
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
// Jack Ritter. An Efficient Bounding Sphere. 1990
+// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
+// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
namespace meshopt
{
-// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
-const size_t kMeshletMaxVertices = 255;
+// This must be <= 256 since meshlet indices are stored as bytes
+const size_t kMeshletMaxVertices = 256;
// A reasonable limit is around 2*max_vertices or less
const size_t kMeshletMaxTriangles = 512;
+// We keep a limited number of seed triangles and add a few triangles per finished meshlet
+const size_t kMeshletMaxSeeds = 256;
+const size_t kMeshletAddSeeds = 4;
+
+// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
+const int kMeshletMaxTreeDepth = 50;
+
struct TriangleAdjacency2
{
unsigned int* counts;
@@ -70,72 +90,190 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
for (size_t i = 0; i < vertex_count; ++i)
{
assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
adjacency.offsets[i] -= adjacency.counts[i];
}
}
-static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
- assert(count > 0);
+ size_t face_count = index_count / 3;
- // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
- size_t pmin[3] = {0, 0, 0};
- size_t pmax[3] = {0, 0, 0};
+ // sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+ const unsigned int sparse_seen = 1u << 31;
+ assert(index_count < sparse_seen);
+
+ // allocate arrays
+ adjacency.counts = allocator.allocate(vertex_count);
+ adjacency.offsets = allocator.allocate(vertex_count);
+ adjacency.data = allocator.allocate(index_count);
+
+ // fill triangle counts
+ for (size_t i = 0; i < index_count; ++i)
+ assert(indices[i] < vertex_count);
+
+ for (size_t i = 0; i < index_count; ++i)
+ adjacency.counts[indices[i]] = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ adjacency.counts[indices[i]]++;
+
+ // fill offset table; uses sparse_seen bit to tag visited vertices
+ unsigned int offset = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int v = indices[i];
+
+ if ((adjacency.counts[v] & sparse_seen) == 0)
+ {
+ adjacency.offsets[v] = offset;
+ offset += adjacency.counts[v];
+ adjacency.counts[v] |= sparse_seen;
+ }
+ }
+
+ assert(offset == index_count);
+
+ // fill triangle data
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+ adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+ }
+
+ // fix offsets that have been disturbed by the previous pass
+ // also fix counts (that were marked with sparse_seen by the first pass)
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int v = indices[i];
+
+ if (adjacency.counts[v] & sparse_seen)
+ {
+ adjacency.counts[v] &= ~sparse_seen;
+
+ assert(adjacency.offsets[v] >= adjacency.counts[v]);
+ adjacency.offsets[v] -= adjacency.counts[v];
+ }
+ }
+}
+
+static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
+{
+ // for sparse inputs, it's faster to only clear vertices referenced by the index buffer
+ if (vertex_count <= index_count)
+ memset(used, -1, vertex_count * sizeof(short));
+ else
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ assert(indices[i] < vertex_count);
+ used[indices[i]] = -1;
+ }
+}
+
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
+{
+ static const float kAxes[7][3] = {
+ // X, Y, Z
+ {1, 0, 0},
+ {0, 1, 0},
+ {0, 0, 1},
+
+ // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
+ {0.57735026f, 0.57735026f, 0.57735026f},
+ {-0.57735026f, 0.57735026f, 0.57735026f},
+ {0.57735026f, -0.57735026f, 0.57735026f},
+ {0.57735026f, 0.57735026f, -0.57735026f},
+ };
+
+ assert(count > 0);
+ assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));
+
+ size_t points_stride_float = points_stride / sizeof(float);
+ size_t radii_stride_float = radii_stride / sizeof(float);
+
+ // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
+ size_t pmin[7], pmax[7];
+ float tmin[7], tmax[7];
+
+ for (size_t axis = 0; axis < axis_count; ++axis)
+ {
+ pmin[axis] = pmax[axis] = 0;
+ tmin[axis] = FLT_MAX;
+ tmax[axis] = -FLT_MAX;
+ }
for (size_t i = 0; i < count; ++i)
{
- const float* p = points[i];
+ const float* p = points + i * points_stride_float;
+ float r = radii[i * radii_stride_float];
- for (int axis = 0; axis < 3; ++axis)
+ for (size_t axis = 0; axis < axis_count; ++axis)
{
- pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
- pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+ const float* ax = kAxes[axis];
+
+ float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
+ float tpmin = tp - r, tpmax = tp + r;
+
+ pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
+ pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
+ tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
+ tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
}
}
// find the pair of points with largest distance
- float paxisd2 = 0;
- int paxis = 0;
+ size_t paxis = 0;
+ float paxisdr = 0;
- for (int axis = 0; axis < 3; ++axis)
+ for (size_t axis = 0; axis < axis_count; ++axis)
{
- const float* p1 = points[pmin[axis]];
- const float* p2 = points[pmax[axis]];
+ const float* p1 = points + pmin[axis] * points_stride_float;
+ const float* p2 = points + pmax[axis] * points_stride_float;
+ float r1 = radii[pmin[axis] * radii_stride_float];
+ float r2 = radii[pmax[axis] * radii_stride_float];
float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+ float dr = sqrtf(d2) + r1 + r2;
- if (d2 > paxisd2)
+ if (dr > paxisdr)
{
- paxisd2 = d2;
+ paxisdr = dr;
paxis = axis;
}
}
// use the longest segment as the initial sphere diameter
- const float* p1 = points[pmin[paxis]];
- const float* p2 = points[pmax[paxis]];
+ const float* p1 = points + pmin[paxis] * points_stride_float;
+ const float* p2 = points + pmax[paxis] * points_stride_float;
+ float r1 = radii[pmin[paxis] * radii_stride_float];
+ float r2 = radii[pmax[paxis] * radii_stride_float];
- float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
- float radius = sqrtf(paxisd2) / 2;
+ float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
+ float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
+
+ float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
+ float radius = paxisdr / 2;
// iteratively adjust the sphere up until all points fit
for (size_t i = 0; i < count; ++i)
{
- const float* p = points[i];
+ const float* p = points + i * points_stride_float;
+ float r = radii[i * radii_stride_float];
+
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+ float d = sqrtf(d2);
- if (d2 > radius * radius)
+ if (d + r > radius)
{
- float d = sqrtf(d2);
- assert(d > 0);
+ float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;
- float k = 0.5f + (radius / d) / 2;
-
- center[0] = center[0] * k + p[0] * (1 - k);
- center[1] = center[1] * k + p[1] * (1 - k);
- center[2] = center[2] * k + p[2] * (1 - k);
- radius = (radius + d) / 2;
+ center[0] += k * (p[0] - center[0]);
+ center[1] += k * (p[1] - center[1]);
+ center[2] += k * (p[2] - center[2]);
+ radius = (radius + d + r) / 2;
}
}
@@ -151,12 +289,12 @@ struct Cone
float nx, ny, nz;
};
-static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
{
float cone = 1.f - spread * cone_weight;
float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
- return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+ return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
}
static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
@@ -221,72 +359,61 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices,
return mesh_area;
}
-static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
{
- size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
-
- // fill 4b padding with 0
- while (offset & 3)
- meshlet_triangles[offset++] = 0;
-}
-
-static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
-{
- unsigned char& av = used[a];
- unsigned char& bv = used[b];
- unsigned char& cv = used[c];
+ short& av = used[a];
+ short& bv = used[b];
+ short& cv = used[c];
bool result = false;
- unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+ int used_extra = (av < 0) + (bv < 0) + (cv < 0);
- if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+ if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
{
meshlets[meshlet_offset] = meshlet;
for (size_t j = 0; j < meshlet.vertex_count; ++j)
- used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
-
- finishMeshlet(meshlet, meshlet_triangles);
+ used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
meshlet.vertex_offset += meshlet.vertex_count;
- meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+ meshlet.triangle_offset += meshlet.triangle_count * 3;
meshlet.vertex_count = 0;
meshlet.triangle_count = 0;
result = true;
}
- if (av == 0xff)
+ if (av < 0)
{
- av = (unsigned char)meshlet.vertex_count;
+ av = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
}
- if (bv == 0xff)
+ if (bv < 0)
{
- bv = (unsigned char)meshlet.vertex_count;
+ bv = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
}
- if (cv == 0xff)
+ if (cv < 0)
{
- cv = (unsigned char)meshlet.vertex_count;
+ cv = short(meshlet.vertex_count);
meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
}
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
- meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
+ meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
meshlet.triangle_count++;
return result;
}
-static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra)
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
{
unsigned int best_triangle = ~0u;
- unsigned int best_extra = 5;
+ int best_priority = 5;
float best_score = FLT_MAX;
for (size_t i = 0; i < meshlet.vertex_count; ++i)
@@ -301,61 +428,159 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
unsigned int triangle = neighbors[j];
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
- unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+ int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+ assert(extra <= 2);
+
+ int priority = -1;
// triangles that don't add new vertices to meshlets are max. priority
- if (extra != 0)
- {
- // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
- if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
- extra = 0;
-
- extra++;
- }
+ if (extra == 0)
+ priority = 0;
+ // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+ else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+ priority = 1;
+ // if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
+ else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
+ priority = 1 + extra;
+ // otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
+ else
+ priority = 2 + extra;
// since topology-based priority is always more important than the score, we can skip scoring in some cases
- if (extra > best_extra)
+ if (priority > best_priority)
continue;
- float score = 0;
+ const Cone& tri_cone = triangles[triangle];
- // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
- if (meshlet_cone)
- {
- const Cone& tri_cone = triangles[triangle];
+ float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
+ float distance = sqrtf(dx * dx + dy * dy + dz * dz);
+ float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
- float distance2 =
- (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
- (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
- (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
-
- float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
-
- score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
- }
- else
- {
- // each live_triangles entry is >= 1 since it includes the current triangle we're processing
- score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
- }
+ float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
// note that topology-based priority is always more important than the score
// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
- if (extra < best_extra || score < best_score)
+ if (priority < best_priority || score < best_score)
{
best_triangle = triangle;
- best_extra = extra;
+ best_priority = priority;
best_score = score;
}
}
}
- if (out_extra)
- *out_extra = best_extra;
-
return best_triangle;
}
+static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+ unsigned int best_seeds[kMeshletAddSeeds];
+ unsigned int best_live[kMeshletAddSeeds];
+ float best_score[kMeshletAddSeeds];
+
+ for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+ {
+ best_seeds[i] = ~0u;
+ best_live[i] = ~0u;
+ best_score[i] = FLT_MAX;
+ }
+
+ for (size_t i = 0; i < meshlet.vertex_count; ++i)
+ {
+ unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+ unsigned int best_neighbor = ~0u;
+ unsigned int best_neighbor_live = ~0u;
+
+ // find the neighbor with the smallest live metric
+ unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+ size_t neighbors_size = adjacency.counts[index];
+
+ for (size_t j = 0; j < neighbors_size; ++j)
+ {
+ unsigned int triangle = neighbors[j];
+ unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+ unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+
+ if (live < best_neighbor_live)
+ {
+ best_neighbor = triangle;
+ best_neighbor_live = live;
+ }
+ }
+
+ // add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
+ if (best_neighbor == ~0u)
+ continue;
+
+ float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
+ float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ for (size_t j = 0; j < kMeshletAddSeeds; ++j)
+ {
+ // non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
+ if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
+ {
+ best_seeds[j] = best_neighbor;
+ best_live[j] = best_neighbor_live;
+ best_score[j] = best_neighbor_score;
+ break;
+ }
+ }
+ }
+
+ // add surviving seeds to the meshlet
+ size_t seed_count = 0;
+
+ for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+ if (best_seeds[i] != ~0u)
+ seeds[seed_count++] = best_seeds[i];
+
+ return seed_count;
+}
+
+static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
+{
+ size_t result = 0;
+
+ for (size_t i = 0; i < seed_count; ++i)
+ {
+ unsigned int index = seeds[i];
+
+ seeds[result] = index;
+ result += emitted_flags[index] == 0;
+ }
+
+ return result;
+}
+
+static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+ unsigned int best_seed = ~0u;
+ unsigned int best_live = ~0u;
+ float best_score = FLT_MAX;
+
+ for (size_t i = 0; i < seed_count; ++i)
+ {
+ unsigned int index = seeds[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+ float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
+ float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (live < best_live || (live == best_live && score < best_score))
+ {
+ best_seed = index;
+ best_live = live;
+ best_score = score;
+ }
+ }
+
+ return best_seed;
+}
+
struct KDNode
{
union
@@ -364,13 +589,13 @@ struct KDNode
unsigned int index;
};
- // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+ // leaves: axis = 3, children = number of points including this one
// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
unsigned int axis : 2;
unsigned int children : 30;
};
-static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
{
size_t m = 0;
@@ -400,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
result.index = indices[0];
result.axis = 3;
- result.children = unsigned(count - 1);
+ result.children = unsigned(count);
// all remaining points are stored in nodes immediately following the leaf
for (size_t i = 1; i < count; ++i)
@@ -415,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
return offset + count;
}
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
{
assert(count > 0);
assert(offset < node_count);
@@ -441,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
}
// split axis is one where the variance is largest
- unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+ int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
float split = mean[axis];
size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
// when the partition is degenerate simply consolidate the points into a single node
- if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
KDNode& result = nodes[offset];
@@ -456,35 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
result.axis = axis;
// left subtree is right after our node
- size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+ size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
// distance to the right subtree is represented explicitly
+ assert(next_offset - offset > 1);
result.children = unsigned(next_offset - offset - 1);
- return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+ return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
}
static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
{
const KDNode& node = nodes[root];
+ if (node.children == 0)
+ return;
+
if (node.axis == 3)
{
// leaf
- for (unsigned int i = 0; i <= node.children; ++i)
+ bool inactive = true;
+
+ for (unsigned int i = 0; i < node.children; ++i)
{
unsigned int index = nodes[root + i].index;
if (emitted_flags[index])
continue;
+ inactive = false;
+
const float* point = points + index * stride;
- float distance2 =
- (point[0] - position[0]) * (point[0] - position[0]) +
- (point[1] - position[1]) * (point[1] - position[1]) +
- (point[2] - position[2]) * (point[2] - position[2]);
- float distance = sqrtf(distance2);
+ float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
+ float distance = sqrtf(dx * dx + dy * dy + dz * dz);
if (distance < limit)
{
@@ -492,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
limit = distance;
}
}
+
+ // deactivate leaves that no longer have items to emit
+ if (inactive)
+ nodes[root].children = 0;
}
else
{
@@ -500,6 +735,12 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
unsigned int first = (delta <= 0) ? 0 : node.children;
unsigned int second = first ^ node.children;
+ // deactivate branches that no longer have items to emit to accelerate traversal
+ // note that we do this *before* recursing which delays deactivation but keeps tail calls
+ if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
+ nodes[root].children = 0;
+
+ // recursion depth is bounded by tree depth (which is limited by construction)
kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
// only process the other node if it can have a match based on closest distance so far
@@ -508,6 +749,380 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
}
}
+struct BVHBoxT
+{
+ float min[4];
+ float max[4];
+};
+
+struct BVHBox
+{
+ float min[3];
+ float max[3];
+};
+
+#if defined(SIMD_SSE)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ __m128 min = _mm_loadu_ps(box.min);
+ __m128 max = _mm_loadu_ps(box.max);
+
+ // note: over-read is safe because BVHBox array is allocated with padding
+ min = _mm_min_ps(min, _mm_loadu_ps(other.min));
+ max = _mm_max_ps(max, _mm_loadu_ps(other.max));
+
+ _mm_storeu_ps(box.min, min);
+ _mm_storeu_ps(box.max, max);
+
+ __m128 size = _mm_sub_ps(max, min);
+ __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
+ __m128 mul = _mm_mul_ps(size, size_yzx);
+ __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
+ __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
+
+ return _mm_cvtss_f32(sum_xyz);
+}
+#elif defined(SIMD_NEON)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ float32x4_t min = vld1q_f32(box.min);
+ float32x4_t max = vld1q_f32(box.max);
+
+ // note: over-read is safe because BVHBox array is allocated with padding
+ min = vminq_f32(min, vld1q_f32(other.min));
+ max = vmaxq_f32(max, vld1q_f32(other.max));
+
+ vst1q_f32(box.min, min);
+ vst1q_f32(box.max, max);
+
+ float32x4_t size = vsubq_f32(max, min);
+ float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
+ float32x4_t mul = vmulq_f32(size, size_yzx);
+ float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
+ float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
+
+ return sum_xyz;
+}
+#else
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+ for (int k = 0; k < 3; ++k)
+ {
+ box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
+ box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
+ }
+
+ float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
+ return sx * sy + sx * sz + sy * sz;
+}
+#endif
+
+inline unsigned int radixFloat(unsigned int v)
+{
+ // if sign bit is 0, flip sign bit
+ // if sign bit is 1, flip everything
+ unsigned int mask = (int(v) >> 31) | 0x80000000;
+ return v ^ mask;
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
+{
+ memset(hist, 0, sizeof(hist));
+
+ const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
+
+ // compute 3 10-bit histograms in parallel (dropping 2 LSB)
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = radixFloat(bits[i]);
+
+ hist[(id >> 2) & 1023][0]++;
+ hist[(id >> 12) & 1023][1]++;
+ hist[(id >> 22) & 1023][2]++;
+ }
+
+ unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
+
+ // replace histogram data with prefix histogram sums in-place
+ for (int i = 0; i < 1024; ++i)
+ {
+ unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+ hist[i][0] = sum0;
+ hist[i][1] = sum1;
+ hist[i][2] = sum2;
+
+ sum0 += hx;
+ sum1 += hy;
+ sum2 += hz;
+ }
+
+ assert(sum0 == count && sum1 == count && sum2 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+ const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
+ int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
+
+ destination[hist[id][pass]++] = source[i];
+ }
+}
+
+static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
+{
+ (void)vertex_count;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ const float* va = vertex_positions + vertex_stride_float * a;
+ const float* vb = vertex_positions + vertex_stride_float * b;
+ const float* vc = vertex_positions + vertex_stride_float * c;
+
+ BVHBox& box = boxes[i];
+
+ for (int k = 0; k < 3; ++k)
+ {
+ box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
+ box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
+
+ box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
+ box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
+
+ centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
+ }
+ }
+}
+
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
+{
+ // count number of unique vertices
+ size_t used_vertices = 0;
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int index = order[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+ used[a] = used[b] = used[c] = 1;
+
+ if (out)
+ out[i] = unsigned(used_vertices);
+ }
+
+ // reset used[] for future invocations
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int index = order[i];
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ used[a] = used[b] = used[c] = -1;
+ }
+
+ return used_vertices;
+}
+
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
+ // mark meshlet boundary for future reassembly
+ assert(count > 0);
+
+ boundary[0] = 1;
+ memset(boundary + 1, 0, count - 1);
+}
+
+static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
+{
+ for (size_t i = 0; i < count;)
+ {
+ size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
+
+ if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
+ {
+ bvhPackLeaf(boundary + i, chunk);
+ i += chunk;
+ continue;
+ }
+
+ // chunk is vertex bound, split it into smaller meshlets
+ assert(chunk > max_vertices / 3);
+
+ bvhPackLeaf(boundary + i, max_vertices / 3);
+ i += max_vertices / 3;
+ }
+}
+
+static bool bvhDivisible(size_t count, size_t min, size_t max)
+{
+ // count is representable as a sum of values in [min..max] if it is in range of [k*min..k*min+k*(max-min)]
+ // equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
+ // we avoid expensive integer divisions in the common case where min is <= max/2
+ return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
+}
+
+static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
+{
+ BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
+ BVHBoxT accumr = accuml;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ float larea = boxMerge(accuml, boxes[order[i]]);
+ float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
+
+ areas[i] = larea;
+ areas[i + count] = rarea;
+ }
+}
+
+static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
+{
+ bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
+ size_t end = aligned ? count - min : count - 1;
+
+ float rmaxfill = 1.f / float(int(maxfill));
+
+ // find best split that minimizes SAH
+ size_t bestsplit = 0;
+ float bestcost = FLT_MAX;
+
+ for (size_t i = min - 1; i < end; i += step)
+ {
+ size_t lsplit = i + 1, rsplit = count - (i + 1);
+
+ if (!bvhDivisible(lsplit, min, max))
+ continue;
+ if (aligned && !bvhDivisible(rsplit, min, max))
+ continue;
+
+ // areas[x] = inclusive surface area of boxes[0..x]
+ // areas[count-1-x] = inclusive surface area of boxes[x..count-1]
+ float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
+ float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
+
+ if (cost > bestcost)
+ continue;
+
+ // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
+ // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
+ size_t lfill = vertices ? vertices[i] : lsplit;
+ size_t rfill = vertices ? vertices[i] : rsplit;
+
+ // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
+ int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
+ int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
+
+ cost += fill * (float(lrest) * larea + float(rrest) * rarea);
+
+ if (cost < bestcost)
+ {
+ bestcost = cost;
+ bestsplit = i + 1;
+ }
+ }
+
+ *out_cost = bestcost;
+ return bestsplit;
+}
+
+static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+ size_t l = 0, r = split;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned char side = sides[order[i]];
+ target[side ? r : l] = order[i];
+ l += 1;
+ l -= side;
+ r += side;
+ }
+
+ assert(l == split && r == count);
+}
+
+static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
+ return bvhPackLeaf(boundary, count);
+
+ unsigned int* axes[3] = {orderx, ordery, orderz};
+
+ // we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
+ size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
+
+ // if we could not pack the meshlet, we must be vertex bound
+ size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
+ size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
+
+ // find best split that minimizes SAH
+ int bestk = -1;
+ size_t bestsplit = 0;
+ float bestcost = FLT_MAX;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ float* areas = static_cast<float*>(scratch);
+ unsigned int* vertices = NULL;
+
+ bvhComputeArea(areas, boxes, axes[k], count);
+
+ if (count <= max_triangles)
+ {
+ // for vertex bound clusters, count number of unique vertices for each split
+ vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
+ bvhCountVertices(axes[k], count, used, indices, vertices);
+ }
+
+ float axiscost = FLT_MAX;
+ size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
+
+ if (axissplit && axiscost < bestcost)
+ {
+ bestk = k;
+ bestcost = axiscost;
+ bestsplit = axissplit;
+ }
+ }
+
+ // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+ if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
+ return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
+
+ // mark sides of split for partitioning
+ unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+ for (size_t i = 0; i < bestsplit; ++i)
+ sides[axes[bestk][i]] = 0;
+
+ for (size_t i = bestsplit; i < count; ++i)
+ sides[axes[bestk][i]] = 1;
+
+ // partition all axes into two sides, maintaining order
+ unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ if (k == bestk)
+ continue;
+
+ unsigned int* axis = axes[k];
+ memcpy(temp, axis, sizeof(unsigned int) * count);
+ bvhPartition(axis, temp, sides, bestsplit, count);
+ }
+
+ // recursion depth is bounded due to max depth check above
+ bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+ bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
} // namespace meshopt
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
@@ -517,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
assert(index_count % 3 == 0);
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
(void)kMeshletMaxVertices;
(void)kMeshletMaxTriangles;
@@ -532,7 +1146,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
}
-size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
{
using namespace meshopt;
@@ -541,18 +1155,24 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
assert(vertex_positions_stride % sizeof(float) == 0);
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
- assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+ assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
assert(cone_weight >= 0 && cone_weight <= 1);
+ assert(split_factor >= 0);
+
+ if (index_count == 0)
+ return 0;
meshopt_Allocator allocator;
TriangleAdjacency2 adjacency = {};
- buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+ if (vertex_count > index_count && index_count < (1u << 31))
+ buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+ else
+ buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
- unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
- memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+ // live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+ unsigned int* live_triangles = adjacency.counts;
size_t face_count = index_count / 3;
@@ -573,11 +1193,45 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
kdindices[i] = unsigned(i);
KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
- kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+ kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
- // index of the vertex in the meshlet, 0xff if the vertex isn't used
- unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
- memset(used, -1, vertex_count);
+ // find a specific corner of the mesh to use as a starting point for meshlet flow
+ float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ const Cone& tri = triangles[i];
+
+ cornerx = cornerx > tri.px ? tri.px : cornerx;
+ cornery = cornery > tri.py ? tri.py : cornery;
+ cornerz = cornerz > tri.pz ? tri.pz : cornerz;
+ }
+
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
+
+ // initial seed triangle is the one closest to the corner
+ unsigned int initial_seed = ~0u;
+ float initial_score = FLT_MAX;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ const Cone& tri = triangles[i];
+
+ float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
+ float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (initial_seed == ~0u || score < initial_score)
+ {
+ initial_seed = unsigned(i);
+ initial_score = score;
+ }
+ }
+
+ // seed triangles to continue meshlet flow
+ unsigned int seeds[kMeshletMaxSeeds] = {};
+ size_t seed_count = 0;
meshopt_Meshlet meshlet = {};
size_t meshlet_offset = 0;
@@ -588,46 +1242,61 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
{
Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
- unsigned int best_extra = 0;
- unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
+ unsigned int best_triangle = ~0u;
- // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
- if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
- {
- best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
- }
+ // for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
+ // to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
+ if (meshlet_offset == 0 && meshlet.triangle_count == 0)
+ best_triangle = initial_seed;
+ else
+ best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
- // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+ bool split = false;
+
+ // when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
if (best_triangle == ~0u)
{
float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
unsigned int index = ~0u;
- float limit = FLT_MAX;
+ float distance = FLT_MAX;
- kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+ kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
best_triangle = index;
+ split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
}
if (best_triangle == ~0u)
break;
+ int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
+
+ // if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
+ if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+ {
+ seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
+ seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
+ seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
+
+ unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
+
+ // we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
+ best_triangle = best_seed != ~0u ? best_seed : best_triangle;
+ }
+
unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
- if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+ if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
{
meshlet_offset++;
memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
}
- live_triangles[a]--;
- live_triangles[b]--;
- live_triangles[c]--;
-
// remove emitted triangle from adjacency data
// this makes sure that we spend less time traversing these lists on subsequent iterations
+ // live triangle counts are updated as a byproduct of these adjustments
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[best_triangle * 3 + k];
@@ -656,20 +1325,23 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
meshlet_cone_acc.ny += triangles[best_triangle].ny;
meshlet_cone_acc.nz += triangles[best_triangle].nz;
+ assert(!emitted_flags[best_triangle]);
emitted_flags[best_triangle] = 1;
}
if (meshlet.triangle_count)
- {
- finishMeshlet(meshlet, meshlet_triangles);
-
meshlets[meshlet_offset++] = meshlet;
- }
- assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+ assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
return meshlet_offset;
}
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+ return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
+}
+
size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
using namespace meshopt;
@@ -678,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
- assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
meshopt_Allocator allocator;
- // index of the vertex in the meshlet, 0xff if the vertex isn't used
- unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
- memset(used, -1, vertex_count);
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
meshopt_Meshlet meshlet = {};
size_t meshlet_offset = 0;
@@ -699,13 +1370,109 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
}
if (meshlet.triangle_count)
- {
- finishMeshlet(meshlet, meshlet_triangles);
-
meshlets[meshlet_offset++] = meshlet;
- }
assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
+ return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+ assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+
+ if (index_count == 0)
+ return 0;
+
+ size_t face_count = index_count / 3;
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ meshopt_Allocator allocator;
+
+ // 3 floats plus 1 uint for sorting, or
+ // 2 floats plus 1 uint for pivoting, or
+ // 1 uint plus 1 byte for partitioning
+ float* scratch = allocator.allocate<float>(face_count * 4);
+
+ // compute bounding boxes and centroids for sorting
+ BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
+ bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+ memset(boxes + face_count, 0, sizeof(BVHBox));
+
+ unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
+ unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ unsigned int* order = axes + k * face_count;
+ const float* keys = scratch + k * face_count;
+
+ unsigned int hist[1024][3];
+ computeHistogram(hist, keys, face_count);
+
+ // 3-pass radix sort computes the resulting order into axes
+ for (size_t i = 0; i < face_count; ++i)
+ temp[i] = unsigned(i);
+
+ radixPass(order, temp, keys, face_count, hist, 0);
+ radixPass(temp, order, keys, face_count, hist, 1);
+ radixPass(order, temp, keys, face_count, hist, 2);
+ }
+
+ // index of the vertex in the meshlet, -1 if the vertex isn't used
+ short* used = allocator.allocate<short>(vertex_count);
+ clearUsed(used, vertex_count, indices, index_count);
+
+ unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
+
+ bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+
+ // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
+ size_t meshlet_count = 0;
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ assert(boundary[i] <= 1);
+ meshlet_count += boundary[i];
+ }
+
+ size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
+
+ // pack triangles into meshlets according to the order and boundaries marked by bvhSplit
+ meshopt_Meshlet meshlet = {};
+ size_t meshlet_offset = 0;
+ size_t meshlet_pending = meshlet_count;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ assert(boundary[i] <= 1);
+ bool split = i > 0 && boundary[i] == 1;
+
+ // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
+ if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
+ split = false;
+
+ unsigned int index = axes[i];
+ assert(index < face_count);
+
+ unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+ // appends triangle to the meshlet and writes previous meshlet to the output if full
+ meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
+ meshlet_pending -= boundary[i];
+ }
+
+ if (meshlet.triangle_count)
+ meshlets[meshlet_offset++] = meshlet;
+
+ assert(meshlet_offset <= meshlet_bound);
+ assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
return meshlet_offset;
}
@@ -765,15 +1532,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
if (triangles == 0)
return bounds;
+ const float rzero = 0.f;
+
// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
float psphere[4] = {};
- computeBoundingSphere(psphere, corners[0], triangles * 3);
+ computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);
float center[3] = {psphere[0], psphere[1], psphere[2]};
// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
float nsphere[4] = {};
- computeBoundingSphere(nsphere, normals, triangles);
+ computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);
float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@@ -883,6 +1652,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}
+meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
+{
+ using namespace meshopt;
+
+ assert(positions_stride >= 12 && positions_stride <= 256);
+ assert(positions_stride % sizeof(float) == 0);
+ assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
+ assert(radii_stride % sizeof(float) == 0);
+
+ meshopt_Bounds bounds = {};
+
+ if (count == 0)
+ return bounds;
+
+ const float rzero = 0.f;
+
+ float psphere[4] = {};
+ computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);
+
+ bounds.center[0] = psphere[0];
+ bounds.center[1] = psphere[1];
+ bounds.center[2] = psphere[2];
+ bounds.radius = psphere[3];
+
+ return bounds;
+}
+
void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
{
using namespace meshopt;
@@ -950,25 +1746,28 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
unsigned int order[kMeshletMaxVertices];
- unsigned char remap[kMeshletMaxVertices];
- memset(remap, -1, vertex_count);
+ short remap[kMeshletMaxVertices];
+ memset(remap, -1, vertex_count * sizeof(short));
size_t vertex_offset = 0;
for (size_t i = 0; i < triangle_count * 3; ++i)
{
- unsigned char& r = remap[indices[i]];
+ short& r = remap[indices[i]];
- if (r == 0xff)
+ if (r < 0)
{
- r = (unsigned char)(vertex_offset);
+ r = short(vertex_offset);
order[vertex_offset] = vertices[indices[i]];
vertex_offset++;
}
- indices[i] = r;
+ indices[i] = (unsigned char)r;
}
assert(vertex_offset <= vertex_count);
memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
diff --git a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
similarity index 58%
rename from Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
index 368274382..87ceeae66 100644
--- a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
@@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind
return result;
}
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ meshopt_VertexFetchStatistics result = {};
+
+ unsigned char* vertex_visited = allocator.allocate(vertex_count);
+ memset(vertex_visited, 0, vertex_count);
+
+ const size_t kCacheLine = 64;
+ const size_t kCacheSize = 128 * 1024;
+
+ // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+ size_t cache[kCacheSize / kCacheLine] = {};
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ vertex_visited[index] = 1;
+
+ size_t start_address = index * vertex_size;
+ size_t end_address = start_address + vertex_size;
+
+ size_t start_tag = start_address / kCacheLine;
+ size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
+
+ assert(start_tag < end_tag);
+
+ for (size_t tag = start_tag; tag < end_tag; ++tag)
+ {
+ size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+ // we store +1 since cache is filled with 0 by default
+ result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+ cache[line] = tag + 1;
+ }
+ }
+
+ size_t unique_vertex_count = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ unique_vertex_count += vertex_visited[i];
+
+ result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
+
+ return result;
+}
diff --git a/Source/ThirdParty/meshoptimizer/indexcodec.cpp b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
index b30046005..7a8fd6867 100644
--- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
const unsigned char kSequenceHeader = 0xd0;
static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;
typedef unsigned int VertexFifo[16];
typedef unsigned int EdgeFifo[16][2];
@@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
if (fer >= 0 && (fer >> 2) < 15)
{
+ // note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
const unsigned int* order = kTriangleIndexOrder[fer & 3];
unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
@@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+ // note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
int fea = (a == next) ? (next++, 0) : 15;
int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
@@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
void meshopt_encodeIndexVersion(int version)
{
- assert(unsigned(version) <= 1);
+ assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));
meshopt::gEncodeIndexVersion = version;
}
+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+ if (buffer_size < 1)
+ return -1;
+
+ unsigned char header = buffer[0];
+
+ if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
+ return -1;
+
+ int version = header & 0x0f;
+ if (version > meshopt::kDecodeIndexVersion)
+ return -1;
+
+ return version;
+}
+
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
@@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
return -1;
int version = buffer[0] & 0x0f;
- if (version > 1)
+ if (version > kDecodeIndexVersion)
return -1;
EdgeFifo edgefifo;
@@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
// fifo reads are wrapped around 16 entry buffer
unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+ unsigned int c = 0;
int fec = codetri & 15;
@@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
{
// fifo reads are wrapped around 16 entry buffer
unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
- unsigned int c = (fec == 0) ? next : cf;
+ c = (fec == 0) ? next : cf;
int fec0 = fec == 0;
next += fec0;
- // output triangle
- writeTriangle(destination, i, index_size, a, b, c);
-
- // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
-
- pushEdgeFifo(edgefifo, c, b, edgefifooffset);
- pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
else
{
- unsigned int c = 0;
-
// fec - (fec ^ 3) decodes 13, 14 into -1, 1
// note that we need to update the last index since free indices are delta-encoded
last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
- // output triangle
- writeTriangle(destination, i, index_size, a, b, c);
-
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset);
-
- pushEdgeFifo(edgefifo, c, b, edgefifooffset);
- pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
+
+ // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
}
else
{
@@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
return -1;
int version = buffer[0] & 0x0f;
- if (version > 1)
+ if (version > kDecodeIndexVersion)
return -1;
const unsigned char* data = buffer + 1;
diff --git a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
index f6728345a..4bf9fccad 100644
--- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
@@ -5,7 +5,9 @@
#include
// This work is based on:
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
+// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
namespace meshopt
{
@@ -85,6 +87,46 @@ struct VertexStreamHasher
}
};
+struct VertexCustomHasher
+{
+ const float* vertex_positions;
+ size_t vertex_stride_float;
+
+ int (*callback)(void*, unsigned int, unsigned int);
+ void* context;
+
+ size_t hash(unsigned int index) const
+ {
+ const unsigned int* key = reinterpret_cast(vertex_positions + index * vertex_stride_float);
+
+ unsigned int x = key[0], y = key[1], z = key[2];
+
+ // replace negative zero with zero
+ x = (x == 0x80000000) ? 0 : x;
+ y = (y == 0x80000000) ? 0 : y;
+ z = (z == 0x80000000) ? 0 : z;
+
+ // scramble bits to make sure that integer coordinates have entropy in lower bits
+ x ^= x >> 17;
+ y ^= y >> 17;
+ z ^= z >> 17;
+
+ // Optimized Spatial Hashing for Collision Detection of Deformable Objects
+ return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ const float* lp = vertex_positions + lhs * vertex_stride_float;
+ const float* rp = vertex_positions + rhs * vertex_stride_float;
+
+ if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2])
+ return false;
+
+ return callback ? callback(context, lhs, rhs) : true;
+ }
+};
+
struct EdgeHasher
{
const unsigned int* remap;
@@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
allocator.deallocate(vertex_table);
}
+template
+static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices ? indices[i] : unsigned(i);
+ assert(index < vertex_count);
+
+ if (remap[index] != ~0u)
+ continue;
+
+ unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+ if (*entry == ~0u)
+ {
+ *entry = index;
+ remap[index] = next_vertex++;
+ }
+ else
+ {
+ assert(remap[*entry] != ~0u);
+ remap[index] = remap[*entry];
+ }
+ }
+
+ assert(next_vertex <= vertex_count);
+ return next_vertex;
+}
+
template
static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
@@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex
}
}
+template
+static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+ unsigned int* remap = allocator.allocate(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ if (remap[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+ if (*entry == ~0u)
+ *entry = index;
+
+ remap[index] = *entry;
+ }
+
+ destination[i] = remap[index];
+ }
+}
+
} // namespace meshopt
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
-
- memset(destination, -1, vertex_count * sizeof(unsigned int));
-
VertexHasher hasher = {static_cast(vertices), vertex_size, vertex_size};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
-
- unsigned int next_vertex = 0;
-
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices ? indices[i] : unsigned(i);
- assert(index < vertex_count);
-
- if (destination[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
- if (*entry == ~0u)
- {
- *entry = index;
-
- destination[index] = next_vertex++;
- }
- else
- {
- assert(destination[*entry] != ~0u);
-
- destination[index] = destination[*entry];
- }
- }
- }
-
- assert(next_vertex <= vertex_count);
-
- return next_vertex;
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
}
meshopt_Allocator allocator;
-
- memset(destination, -1, vertex_count * sizeof(unsigned int));
-
VertexStreamHasher hasher = {streams, stream_count};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
+}
- unsigned int next_vertex = 0;
+size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
+{
+ using namespace meshopt;
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices ? indices[i] : unsigned(i);
- assert(index < vertex_count);
+ assert(indices || index_count == vertex_count);
+ assert(!indices || index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
- if (destination[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+ meshopt_Allocator allocator;
+ VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context};
- if (*entry == ~0u)
- {
- *entry = index;
-
- destination[index] = next_vertex++;
- }
- else
- {
- assert(destination[*entry] != ~0u);
-
- destination[index] = destination[*entry];
- }
- }
- }
-
- assert(next_vertex <= vertex_count);
-
- return next_vertex;
+ return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
@@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned
assert(vertex_size <= vertex_stride);
meshopt_Allocator allocator;
-
- unsigned int* remap = allocator.allocate(vertex_count);
- memset(remap, -1, vertex_count * sizeof(unsigned int));
-
VertexHasher hasher = {static_cast(vertices), vertex_size, vertex_stride};
- size_t table_size = hashBuckets(vertex_count);
- unsigned int* table = allocator.allocate(table_size);
- memset(table, -1, table_size * sizeof(unsigned int));
-
- for (size_t i = 0; i < index_count; ++i)
- {
- unsigned int index = indices[i];
- assert(index < vertex_count);
-
- if (remap[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
- if (*entry == ~0u)
- *entry = index;
-
- remap[index] = *entry;
- }
-
- destination[i] = remap[index];
- }
+ generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
}
meshopt_Allocator allocator;
-
- unsigned int* remap = allocator.allocate(vertex_count);
- memset(remap, -1, vertex_count * sizeof(unsigned int));
-
VertexStreamHasher hasher = {streams, stream_count};
+ generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
+}
+
+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+ VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
+
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
- for (size_t i = 0; i < index_count; ++i)
+ for (size_t i = 0; i < vertex_count; ++i)
{
- unsigned int index = indices[i];
- assert(index < vertex_count);
+ unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
- if (remap[index] == ~0u)
- {
- unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+ if (*entry == ~0u)
+ *entry = unsigned(i);
- if (*entry == ~0u)
- *entry = index;
-
- remap[index] = *entry;
- }
-
- destination[i] = remap[index];
+ destination[i] = *entry;
}
}
@@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
memcpy(destination + i * 4, patch, sizeof(patch));
}
}
+
+size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+ assert(index_count % 3 == 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* remap = allocator.allocate(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ // compute vertex valence; this is used to prioritize least used corner
+ // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+ unsigned char* valence = allocator.allocate(vertex_count);
+ memset(valence, 0, vertex_count);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ valence[index]++;
+ }
+
+ unsigned int reorder_offset = 0;
+
+ // assign provoking vertices; leave the rest for the next pass
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ // try to rotate triangle such that provoking vertex hasn't been seen before
+ // if multiple vertices are new, prioritize the one with least valence
+ // this reduces the risk that a future triangle will have all three vertices seen
+ unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
+ unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
+ unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
+
+ if (vb != ~0u && vb <= va && vb <= vc)
+ {
+ // abc -> bca
+ unsigned int t = a;
+ a = b, b = c, c = t;
+ }
+ else if (vc != ~0u && vc <= va && vc <= vb)
+ {
+ // abc -> cab
+ unsigned int t = c;
+ c = b, b = a, a = t;
+ }
+
+ unsigned int newidx = reorder_offset;
+
+ // now remap[a] = ~0u or all three vertices are old
+ // recording remap[a] makes it possible to remap future references to the same index, conserving space
+ if (remap[a] == ~0u)
+ remap[a] = newidx;
+
+ // we need to clone the provoking vertex to get a unique index
+ // if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
+ reorder[reorder_offset++] = a;
+
+ // note: first vertex is final, the other two will be fixed up in next pass
+ destination[i + 0] = newidx;
+ destination[i + 1] = b;
+ destination[i + 2] = c;
+
+ // update vertex valences for corner heuristic
+ valence[a]--;
+ valence[b]--;
+ valence[c]--;
+ }
+
+ // remap or clone non-provoking vertices (iterating to skip provoking vertices)
+ int step = 1;
+
+ for (size_t i = 1; i < index_count; i += step, step ^= 3)
+ {
+ unsigned int index = destination[i];
+
+ if (remap[index] == ~0u)
+ {
+ // we haven't seen the vertex before as a provoking vertex
+ // to maintain the reference to the original vertex we need to clone it
+ unsigned int newidx = reorder_offset;
+
+ remap[index] = newidx;
+ reorder[reorder_offset++] = index;
+ }
+
+ destination[i] = remap[index];
+ }
+
+ assert(reorder_offset <= vertex_count + index_count / 3);
+ return reorder_offset;
+}
diff --git a/Source/ThirdParty/meshoptimizer/meshoptimizer.h b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
index 6c8dcd7e8..c9239bc30 100644
--- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h
+++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
/**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 1.0
*
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at https://github.com/zeux/meshoptimizer
*
* This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
#include
/* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */
/* If no API is defined, assume default */
#ifndef MESHOPTIMIZER_API
@@ -29,11 +29,14 @@
#endif
/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#ifndef MESHOPTIMIZER_EXPERIMENTAL
#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+#endif
/* C interface */
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif
/**
@@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
*/
MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
+
/**
* Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
*
@@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
*/
MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+/**
+ * Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
/**
* Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
* Each triangle is converted into a 6-vertex patch with the following layout:
@@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
*/
MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
+ * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
+ * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
+ * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
+ * For maximum efficiency the input index buffer should be optimized for vertex cache first.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
/**
* Vertex transform cache optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
*/
@@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
* Vertex transform cache optimizer for FIFO caches
* Reorders indices to reduce the number of GPU vertex shader invocations
* Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
/**
* Overdraw optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
* Vertex fetch cache optimizer
* Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
* Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
- * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count elements)
* indices is used both as an input and as an output index buffer
@@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
/**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
*/
MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
*/
MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
/**
* Index sequence encoder
* Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
* Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
* This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
* Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
*
* buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
/**
- * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
+
+/**
+ * Set vertex encoder format version (defaults to 1)
+ *
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
*/
MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
@@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
* The decoder is safe to use for untrusted input, but it may produce garbage data.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
/**
* Vertex buffer filters
* These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
*
- * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
*
- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
* Each component is stored as an 16-bit integer; stride must be equal to 8.
*
* meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
* Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ *
+ * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
/**
* Vertex buffer filter encoders
* These functions can be used to encode data in a format that meshopt_decodeFilter can decode
*
- * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
- * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
* Input data must contain 4 floats for every vector (count*4 total).
*
* meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
@@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
* meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
* Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
* Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ *
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
+ * Input data must contain 4 floats for every color (count*4 total).
*/
enum meshopt_EncodeExpMode
{
@@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode
meshopt_EncodeExpSharedVector,
/* When encoding exponents, use shared value for each component of all vectors (best compression) */
meshopt_EncodeExpSharedComponent,
+ /* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+ meshopt_EncodeExpClamped,
};
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
/**
* Simplification options
@@ -334,16 +416,34 @@ enum
meshopt_SimplifySparse = 1 << 1,
/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
meshopt_SimplifyErrorAbsolute = 1 << 2,
+ /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+ meshopt_SimplifyPrune = 1 << 3,
+ /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
+ meshopt_SimplifyRegularize = 1 << 4,
+ /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
+ meshopt_SimplifyPermissive = 1 << 5,
+};
+
+/**
+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
+ */
+enum
+{
+ /* Do not move this vertex. */
+ meshopt_SimplifyVertex_Lock = 1 << 0,
+ /* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
+ meshopt_SimplifyVertex_Protect = 1 << 1,
};
/**
* Mesh simplifier
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
* Returns the number of indices after simplification, with destination containing new index data
+ *
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -354,45 +454,94 @@ enum
MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
- * Experimental: Mesh simplifier with attribute metric
- * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ * Mesh simplifier with attribute metric
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
*
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_attributes should have attribute_count floats for each vertex
- * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
- * attribute_count must be <= 16
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
* vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
- * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
- * Experimental: Mesh simplifier (sloppy)
+ * Mesh simplifier with position/attribute update
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, indices are destructively updated with new index data
+ *
+ * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
+/**
+ * Mesh simplifier (sloppy)
* Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
* The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
* Returns the number of indices after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
/**
- * Experimental: Point cloud simplifier
+ * Mesh simplifier (pruner)
+ * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+
+/**
+ * Point cloud simplifier
* Reduces the number of points in the cloud to reach the given target
* Returns the number of points after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer (target_vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * color_weight determines relative priority of color wrt position; 1.0 is a safe default
*/
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
/**
* Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics
*/
MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+struct meshopt_VertexFetchStatistics
+{
+ unsigned int bytes_fetched;
+ float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
struct meshopt_OverdrawStatistics
{
unsigned int pixels_covered;
@@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics
*/
MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
-struct meshopt_VertexFetchStatistics
+struct meshopt_CoverageStatistics
{
- unsigned int bytes_fetched;
- float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+ float coverage[3];
+ float extent; /* viewport size in mesh coordinates */
};
/**
- * Vertex fetch cache analyzer
- * Returns cache hit statistics using a simplified direct mapped model
- * Results may not match actual GPU performance
+ * Coverage analyzer
+ * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
-MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Meshlet is a small mesh cluster (subset) that consists of:
+ * - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use;
+ * - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
+ *
+ * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
+ */
struct meshopt_Meshlet
{
/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
unsigned int vertex_offset;
unsigned int triangle_offset;
- /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+ /* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
unsigned int vertex_count;
unsigned int triangle_count;
};
@@ -484,14 +654,15 @@ struct meshopt_Meshlet
* Meshlet builder
* Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
* The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
* When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
* When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
*
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
*/
MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
/**
- * Experimental: Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
*
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet builder that produces clusters optimized for raytracing
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
+ */
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+
+/**
+ * Meshlet optimizer
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
+ *
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
struct meshopt_Bounds
{
@@ -544,11 +742,35 @@ struct meshopt_Bounds
* Real-Time Rendering 4th Edition, section 19.3).
*
* vertex_positions should have float3 position in the first 12 bytes of each vertex
- * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
+ * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
+ * index_count/3 and triangle_count must not exceed implementation limits (<= 512)
*/
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+/**
+ * Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
+ * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
+ */
+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+
/**
* Spatial sorter
* Generates a remap table that can be used to reorder points for spatial locality.
@@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig
MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
- * Experimental: Spatial sorter
+ * Spatial sorter
* Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Spatial clusterizer
+ * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
+ * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size.
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
+
+/**
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
+ * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
/**
* Set allocation callbacks
@@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
* Note that all algorithms only allocate memory for temporary use.
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
*/
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
#ifdef __cplusplus
} /* extern "C" */
#endif
-/* Quantization into commonly supported data formats */
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
#ifdef __cplusplus
/**
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
@@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N);
* Maximum reconstruction error: 1/2^N
*/
inline int meshopt_quantizeSnorm(float v, int N);
-
-/**
- * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Representable magnitude range: [6e-5; 65504]
- * Maximum relative reconstruction error: 5e-4
- */
-MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
-
-/**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Assumes N is in a valid mantissa precision range, which is 1..23
- */
-MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
-
-/**
- * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
- * Preserves Inf/NaN, flushes denormals to zero
- */
-MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
#endif
/**
@@ -631,6 +863,10 @@ template
inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
template
inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
+template
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
template
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
template
@@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
template
inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
+template
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
template
inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -661,29 +899,44 @@ template
inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
template
inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
template
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template
inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
template
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
+template
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
template
inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
template
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+template
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
template
inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template
inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
template
inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
template
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+template
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+template
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
#endif
@@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N)
class meshopt_Allocator
{
public:
- template
- struct StorageT
+ struct Storage
{
- static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
- static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
+ void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+ void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
};
- typedef StorageT Storage;
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+ MESHOPTIMIZER_API static Storage& storage();
+#else
+ static Storage& storage()
+ {
+ static Storage s = {::operator new, ::operator delete };
+ return s;
+ }
+#endif
meshopt_Allocator()
- : blocks()
- , count(0)
+ : blocks()
+ , count(0)
{
}
~meshopt_Allocator()
{
for (size_t i = count; i > 0; --i)
- Storage::deallocate(blocks[i - 1]);
+ storage().deallocate(blocks[i - 1]);
}
- template T* allocate(size_t size)
+ template
+ T* allocate(size_t size)
{
assert(count < sizeof(blocks) / sizeof(blocks[0]));
- T* result = static_cast(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+ T* result = static_cast(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
blocks[count++] = result;
return result;
}
@@ -749,7 +1010,7 @@ public:
void deallocate(void* ptr)
{
assert(count > 0 && blocks[count - 1] == ptr);
- Storage::deallocate(ptr);
+ storage().deallocate(ptr);
count--;
}
@@ -757,10 +1018,6 @@ private:
void* blocks[24];
size_t count;
};
-
-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT::allocate)(size_t) = operator new;
-template void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT::deallocate)(void*) = operator delete;
#endif
/* Inline implementation for C++ templated wrappers */
@@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter
{
size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
- data = static_cast(meshopt_Allocator::Storage::allocate(size));
+ data = static_cast(meshopt_Allocator::storage().allocate(size));
if (input)
{
@@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter
result[i] = T(data[i]);
}
- meshopt_Allocator::Storage::deallocate(data);
+ meshopt_Allocator::storage().deallocate(data);
}
};
@@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const
return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
}
+template
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+ struct Call
+ {
+ static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; }
+ };
+
+ return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
+}
+
+template
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+ struct Call
+ {
+ static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; }
+ };
+
+ meshopt_IndexAdapter in(NULL, indices, indices ? index_count : 0);
+
+ return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
+}
+
template
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
{
@@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
+template
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+ meshopt_IndexAdapter out(destination, NULL, index_count);
+
+ size_t bound = vertex_count + (index_count / 3);
+ assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T
+ (void)bound;
+
+ return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
+}
+
template
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
{
@@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
}
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{
+ return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1);
+}
+
template
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
{
@@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
}
+template
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+ meshopt_IndexAdapter inout(indices, indices, index_count);
+
+ return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
template
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
{
meshopt_IndexAdapter in(NULL, indices, index_count);
meshopt_IndexAdapter out(destination, NULL, index_count);
- return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+ meshopt_IndexAdapter out(destination, NULL, index_count);
+
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
+}
+
+template
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+ meshopt_IndexAdapter out(destination, NULL, index_count);
+
+ return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
}
template
@@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
}
template
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
meshopt_IndexAdapter in(NULL, indices, index_count);
- return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+ return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
+}
+
+template
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+
+ return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
}
template
@@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
}
template
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
meshopt_IndexAdapter in(NULL, indices, index_count);
- return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+ return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
template
@@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
}
+template
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+
+ return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
+}
+
+template
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+ meshopt_IndexAdapter in(NULL, indices, index_count);
+
+ return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
template
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
+template
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+ meshopt_IndexAdapter in(NULL, cluster_indices, total_index_count);
+
+ return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size);
+}
+
template
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
#endif
/**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
diff --git a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
index cc22dbcff..682b924a9 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
@@ -10,24 +10,24 @@
namespace meshopt
{
-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float mesh_centroid[3] = {};
- for (size_t i = 0; i < index_count; ++i)
+ for (size_t i = 0; i < vertex_count; ++i)
{
- const float* p = vertex_positions + vertex_stride_float * indices[i];
+ const float* p = vertex_positions + vertex_stride_float * i;
mesh_centroid[0] += p[0];
mesh_centroid[1] += p[1];
mesh_centroid[2] += p[2];
}
- mesh_centroid[0] /= index_count;
- mesh_centroid[1] /= index_count;
- mesh_centroid[2] /= index_count;
+ mesh_centroid[0] /= float(vertex_count);
+ mesh_centroid[1] /= float(vertex_count);
+ mesh_centroid[2] /= float(vertex_count);
for (size_t cluster = 0; cluster < cluster_count; ++cluster)
{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
// fill sort data
float* sort_data = allocator.allocate(cluster_count);
- calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+ calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
// sort clusters using sort data
unsigned short* sort_keys = allocator.allocate(cluster_count);
diff --git a/Source/ThirdParty/meshoptimizer/partition.cpp b/Source/ThirdParty/meshoptimizer/partition.cpp
new file mode 100644
index 000000000..4119a53ed
--- /dev/null
+++ b/Source/ThirdParty/meshoptimizer/partition.cpp
@@ -0,0 +1,624 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
+namespace meshopt
+{
+
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
+struct ClusterAdjacency
+{
+ unsigned int* offsets;
+ unsigned int* clusters;
+ unsigned int* shared;
+};
+
+static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
+{
+ (void)vertex_count;
+ (void)total_index_count;
+
+ size_t cluster_start = 0;
+ size_t cluster_write = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ offsets[i] = unsigned(cluster_write);
+
+ // copy cluster indices, skipping duplicates
+ for (size_t j = 0; j < cluster_index_counts[i]; ++j)
+ {
+ unsigned int v = cluster_indices[cluster_start + j];
+ assert(v < vertex_count);
+
+ data[cluster_write] = v;
+ cluster_write += 1 - used[v];
+ used[v] = 1;
+ }
+
+ // reset used flags for the next cluster
+ for (size_t j = offsets[i]; j < cluster_write; ++j)
+ used[data[j]] = 0;
+
+ cluster_start += cluster_index_counts[i];
+ }
+
+ assert(cluster_start == total_index_count);
+ assert(cluster_write <= total_index_count);
+ offsets[cluster_count] = unsigned(cluster_write);
+}
+
+static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
+{
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ float center[3] = {0, 0, 0};
+
+ // approximate center of the cluster by averaging all vertex positions
+ for (size_t j = 0; j < index_count; ++j)
+ {
+ const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+ center[0] += p[0];
+ center[1] += p[1];
+ center[2] += p[2];
+ }
+
+ // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
+ if (index_count)
+ {
+ center[0] /= float(index_count);
+ center[1] /= float(index_count);
+ center[2] /= float(index_count);
+ }
+
+ // compute radius of the bounding sphere for each cluster
+ float radiussq = 0;
+
+ for (size_t j = 0; j < index_count; ++j)
+ {
+ const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+ float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+ radiussq = radiussq < d2 ? d2 : radiussq;
+ }
+
+ memcpy(out_center, center, sizeof(center));
+ return sqrtf(radiussq);
+}
+
+static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+ unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1);
+
+ // compute number of clusters referenced by each vertex
+ memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ ref_offsets[cluster_indices[j]]++;
+ }
+
+ // compute (worst-case) number of adjacent clusters for each cluster
+ size_t total_adjacency = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ size_t count = 0;
+
+ // worst case is every vertex has a disjoint cluster list
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ count += ref_offsets[cluster_indices[j]] - 1;
+
+ // ... but only every other cluster can be adjacent in the end
+ total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
+ }
+
+ // we can now allocate adjacency buffers
+ adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+ adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
+ adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
+
+ // convert ref counts to offsets
+ size_t total_refs = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ size_t count = ref_offsets[i];
+ ref_offsets[i] = unsigned(total_refs);
+ total_refs += count;
+ }
+
+ unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
+
+ // fill cluster refs for each vertex
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i);
+ }
+
+ // after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
+ memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
+ ref_offsets[0] = 0;
+
+ // fill cluster adjacency for each cluster...
+ adjacency.offsets[0] = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
+ unsigned int* shd = adjacency.shared + adjacency.offsets[i];
+ size_t count = 0;
+
+ for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+ {
+ unsigned int v = cluster_indices[j];
+
+ // merge the entire cluster list of each vertex into current list
+ for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
+ {
+ unsigned int c = ref_data[k];
+ assert(c < cluster_count);
+
+ if (c == unsigned(i))
+ continue;
+
+ // if the cluster is already in the list, increment the shared count
+ bool found = false;
+ for (size_t l = 0; l < count; ++l)
+ if (adj[l] == c)
+ {
+ found = true;
+ shd[l]++;
+ break;
+ }
+
+ // .. or append a new cluster
+ if (!found)
+ {
+ adj[count] = c;
+ shd[count] = 1;
+ count++;
+ }
+ }
+ }
+
+ // mark the end of the adjacency list; the next cluster will start there as well
+ adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
+ }
+
+ assert(adjacency.offsets[cluster_count] <= total_adjacency);
+
+ // ref_offsets can't be deallocated as it was allocated before adjacency
+ allocator.deallocate(ref_data);
+}
+
+struct ClusterGroup
+{
+ int group;
+ int next;
+ unsigned int size; // 0 unless root
+ unsigned int vertices;
+
+ float center[3];
+ float radius;
+};
+
+struct GroupOrder
+{
+ unsigned int id;
+ int order;
+};
+
+static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
+{
+ // insert a new element at the end (breaks heap invariant)
+ heap[size++] = item;
+
+ // bubble up the new element to its correct position
+ size_t i = size - 1;
+ while (i > 0 && heap[i].order < heap[(i - 1) / 2].order)
+ {
+ size_t p = (i - 1) / 2;
+
+ GroupOrder temp = heap[i];
+ heap[i] = heap[p];
+ heap[p] = temp;
+ i = p;
+ }
+}
+
+static GroupOrder heapPop(GroupOrder* heap, size_t size)
+{
+ assert(size > 0);
+ GroupOrder top = heap[0];
+
+ // move the last element to the top (breaks heap invariant)
+ heap[0] = heap[--size];
+
+ // bubble down the new top element to its correct position
+ size_t i = 0;
+ while (i * 2 + 1 < size)
+ {
+ // find the smallest child
+ size_t j = i * 2 + 1;
+ j += (j + 1 < size && heap[j + 1].order < heap[j].order);
+
+ // if the parent is already smaller than both children, we're done
+ if (heap[j].order >= heap[i].order)
+ break;
+
+ // otherwise, swap the parent and child and continue
+ GroupOrder temp = heap[i];
+ heap[i] = heap[j];
+ heap[j] = temp;
+ i = j;
+ }
+
+ return top;
+}
+
+static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
+{
+ unsigned int total = 0;
+
+ for (int i1 = group1; i1 >= 0; i1 = groups[i1].next)
+ for (int i2 = group2; i2 >= 0; i2 = groups[i2].next)
+ {
+ for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj)
+ if (adjacency.clusters[adj] == unsigned(i2))
+ {
+ total += adjacency.shared[adj];
+ break;
+ }
+ }
+
+ return total;
+}
+
+static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
+{
+ float r1 = target.radius, r2 = source.radius;
+ float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+ float d = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ if (d + r1 < r2)
+ {
+ target.center[0] = source.center[0];
+ target.center[1] = source.center[1];
+ target.center[2] = source.center[2];
+ target.radius = source.radius;
+ return;
+ }
+
+ if (d + r2 > r1)
+ {
+ float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
+
+ target.center[0] += dx * k;
+ target.center[1] += dy * k;
+ target.center[2] += dz * k;
+ target.radius = (d + r2 + r1) / 2;
+ }
+}
+
+static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
+{
+ float r1 = target.radius, r2 = source.radius;
+ float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+ float d = sqrtf(dx * dx + dy * dy + dz * dz);
+
+ float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
+
+ return mr > 0 ? r1 / mr : 0.f;
+}
+
+static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
+{
+ assert(groups[id].size > 0);
+
+ float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
+
+ int best_group = -1;
+ float best_score = 0;
+
+ for (int ci = id; ci >= 0; ci = groups[ci].next)
+ {
+ for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
+ {
+ int other = groups[adjacency.clusters[adj]].group;
+ if (other < 0)
+ continue;
+
+ assert(groups[other].size > 0);
+ if (groups[id].size + groups[other].size > max_partition_size)
+ continue;
+
+ unsigned int shared = countShared(groups, id, other, adjacency);
+ float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
+
+ // normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
+ float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
+
+ // incorporate spatial score to favor merging nearby groups
+ if (use_bounds)
+ score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
+
+ if (score > best_score)
+ {
+ best_group = other;
+ best_score = score;
+ }
+ }
+ }
+
+ return best_group;
+}
+
+static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
+{
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = order[i];
+ if (groups[id].size == 0 || groups[id].size >= target_partition_size)
+ continue;
+
+ float best_score = -1.f;
+ int best_group = -1;
+
+ for (size_t j = 0; j < count; ++j)
+ {
+ unsigned int other = order[j];
+ if (id == other || groups[other].size == 0)
+ continue;
+
+ if (groups[id].size + groups[other].size > max_partition_size)
+ continue;
+
+ // favor merging nearby groups
+ float score = boundsScore(groups[id], groups[other]);
+
+ if (score > best_score)
+ {
+ best_score = score;
+ best_group = other;
+ }
+ }
+
+ // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
+ if (best_group != -1)
+ {
+ // combine groups by linking them together
+ unsigned int tail = best_group;
+ while (groups[tail].next >= 0)
+ tail = groups[tail].next;
+
+ groups[tail].next = id;
+
+ // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
+ groups[best_group].size += groups[id].size;
+ groups[id].size = 0;
+
+ // merge bounding spheres
+ mergeBounds(groups[best_group], groups[id]);
+ groups[id].radius = 0.f;
+ }
+ }
+}
+
+static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
+{
+ size_t m = 0;
+
+ // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+ for (size_t i = 0; i < count; ++i)
+ {
+ float v = groups[order[i]].center[axis];
+
+ // swap(m, i) unconditionally
+ unsigned int t = order[m];
+ order[m] = order[i];
+ order[i] = t;
+
+ // when v >= pivot, we swap i with m without advancing it, preserving invariants
+ m += v < pivot;
+ }
+
+ return m;
+}
+
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
+{
+ size_t total = 0;
+ for (size_t i = 0; i < count; ++i)
+ total += groups[order[i]].size;
+
+ if (total <= max_partition_size || count <= leaf_size)
+ return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
+
+ float mean[3] = {};
+ float vars[3] = {};
+ float runc = 1, runs = 1;
+
+ // gather statistics on the points in the subtree using Welford's algorithm
+ for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+ {
+ const float* point = groups[order[i]].center;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ float delta = point[k] - mean[k];
+ mean[k] += delta * runs;
+ vars[k] += delta * (point[k] - mean[k]);
+ }
+ }
+
+ // split axis is one where the variance is largest
+ int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+
+ float split = mean[axis];
+ size_t middle = mergePartition(order, count, groups, axis, split);
+
+ // enforce balance for degenerate partitions
+ // this also ensures recursion depth is bounded on pathological inputs
+ if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
+ middle = count / 2;
+
+ // recursion depth is logarithmic and bounded due to max depth check above
+ mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+ mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+}
+
+} // namespace meshopt
+
+size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+ using namespace meshopt;
+
+ assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_partition_size > 0);
+
+ size_t max_partition_size = target_partition_size + target_partition_size / 3;
+
+ meshopt_Allocator allocator;
+
+ unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+ memset(used, 0, vertex_count);
+
+ unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
+ unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+
+ // make new cluster index list that filters out duplicate indices
+ filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
+ cluster_indices = cluster_newindices;
+
+ // build cluster adjacency along with edge weights (shared vertex count)
+ ClusterAdjacency adjacency = {};
+ buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
+
+ ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
+ memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
+
+ GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
+ size_t pending = 0;
+
+ // create a singleton group for each cluster and order them by priority
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ groups[i].group = int(i);
+ groups[i].next = -1;
+ groups[i].size = 1;
+ groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
+ assert(groups[i].vertices > 0);
+
+ // compute bounding sphere for each cluster if positions are provided
+ if (vertex_positions)
+ groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
+
+ GroupOrder item = {};
+ item.id = unsigned(i);
+ item.order = groups[i].vertices;
+
+ heapPush(order, pending++, item);
+ }
+
+ // iteratively merge the smallest group with the best group
+ while (pending)
+ {
+ GroupOrder top = heapPop(order, pending--);
+
+ // this group was merged into another group earlier
+ if (groups[top.id].size == 0)
+ continue;
+
+ // disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
+ for (int i = top.id; i >= 0; i = groups[i].next)
+ {
+ assert(groups[i].group == int(top.id));
+ groups[i].group = -1;
+ }
+
+ // the group is large enough, emit as is
+ if (groups[top.id].size >= target_partition_size)
+ continue;
+
+ int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
+
+ // we can't grow the group any more, emit as is
+ if (best_group == -1)
+ continue;
+
+ // compute shared vertices to adjust the total vertices estimate after merging
+ unsigned int shared = countShared(groups, top.id, best_group, adjacency);
+
+ // combine groups by linking them together
+ unsigned int tail = top.id;
+ while (groups[tail].next >= 0)
+ tail = groups[tail].next;
+
+ groups[tail].next = best_group;
+
+ // update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
+ groups[top.id].size += groups[best_group].size;
+ groups[top.id].vertices += groups[best_group].vertices;
+ groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
+
+ groups[best_group].size = 0;
+ groups[best_group].vertices = 0;
+
+ // merge bounding spheres if bounds are available
+ if (vertex_positions)
+ {
+ mergeBounds(groups[top.id], groups[best_group]);
+ groups[best_group].radius = 0;
+ }
+
+ // re-associate all clusters back to the merged group
+ for (int i = top.id; i >= 0; i = groups[i].next)
+ groups[i].group = int(top.id);
+
+ top.order = groups[top.id].vertices;
+ heapPush(order, pending++, top);
+ }
+
+ // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
+ if (vertex_positions)
+ {
+ unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
+ size_t merge_offset = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ if (groups[i].size)
+ merge_order[merge_offset++] = unsigned(i);
+
+ mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
+ }
+
+ // output each remaining group
+ size_t next_group = 0;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ if (groups[i].size == 0)
+ continue;
+
+ for (int j = int(i); j >= 0; j = groups[j].next)
+ destination[j] = unsigned(next_group);
+
+ next_group++;
+ }
+
+ assert(next_group <= cluster_count);
+ return next_group;
+}
diff --git a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
similarity index 62%
rename from Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/rasterizer.cpp
index 31cf6f146..bd788ffdb 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
@@ -18,14 +18,6 @@ struct OverdrawBuffer
unsigned int overdraw[kViewport][kViewport][2];
};
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
@@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1,
float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
float invdet = (det == 0) ? 0 : 1 / det;
- dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
- dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+ dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+ dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
return det;
}
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
// bounding rectangle, clipped against viewport
// since we rasterize pixels with covered centers, min >0.5 should round up
// as for max, due to top-left filling convention we will never rasterize right/bottom edges
- // so max >= 0.5 should round down
- int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
- int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
- int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
- int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+ // so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
+ int minx = X1 < X2 ? X1 : X2;
+ minx = minx < X3 ? minx : X3;
+ minx = (minx + 7) >> 4;
+ minx = minx < 0 ? 0 : minx;
+
+ int miny = Y1 < Y2 ? Y1 : Y2;
+ miny = miny < Y3 ? miny : Y3;
+ miny = (miny + 7) >> 4;
+ miny = miny < 0 ? 0 : miny;
+
+ int maxx = X1 > X2 ? X1 : X2;
+ maxx = maxx > X3 ? maxx : X3;
+ maxx = (maxx + 7) >> 4;
+ maxx = maxx > kViewport ? kViewport : maxx;
+
+ int maxy = Y1 > Y2 ? Y1 : Y2;
+ maxy = maxy > Y3 ? maxy : Y3;
+ maxy = (maxy + 7) >> 4;
+ maxy = maxy > kViewport ? kViewport : maxy;
// deltas, 28.4 fixed point
int DX12 = X1 - X2;
@@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
}
}
-} // namespace meshopt
-
-meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
- using namespace meshopt;
-
- assert(index_count % 3 == 0);
- assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
- assert(vertex_positions_stride % sizeof(float) == 0);
-
- meshopt_Allocator allocator;
-
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
- meshopt_OverdrawStatistics result = {};
-
float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
@@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
for (int j = 0; j < 3; ++j)
{
- minv[j] = min(minv[j], v[j]);
- maxv[j] = max(maxv[j], v[j]);
+ float vj = v[j];
+
+ minv[j] = minv[j] > vj ? vj : minv[j];
+ maxv[j] = maxv[j] < vj ? vj : maxv[j];
}
}
- float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
- float scale = kViewport / extent;
+ float extent = 0.f;
- float* triangles = allocator.allocate<float>(index_count * 3);
+ extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+ extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+ extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+ float scale = kViewport / extent;
for (size_t i = 0; i < index_count; ++i)
{
@@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
}
+ return extent;
+}
+
+static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
+{
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ const float* vn0 = &triangles[3 * (i + 0)];
+ const float* vn1 = &triangles[3 * (i + 1)];
+ const float* vn2 = &triangles[3 * (i + 2)];
+
+ switch (axis)
+ {
+ case 0:
+ rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+ break;
+ case 1:
+ rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+ break;
+ case 2:
+ rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+ break;
+ }
+ }
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ meshopt_OverdrawStatistics result = {};
+
+ float* triangles = allocator.allocate<float>(index_count * 3);
+ transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
-
- for (size_t i = 0; i < index_count; i += 3)
- {
- const float* vn0 = &triangles[3 * (i + 0)];
- const float* vn1 = &triangles[3 * (i + 1)];
- const float* vn2 = &triangles[3 * (i + 2)];
-
- switch (axis)
- {
- case 0:
- rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
- break;
- case 1:
- rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
- break;
- case 2:
- rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
- break;
- }
- }
+ rasterizeTriangles(buffer, triangles, index_count, axis);
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
@@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
return result;
}
+
+meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ meshopt_CoverageStatistics result = {};
+
+ float* triangles = allocator.allocate<float>(index_count * 3);
+ float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+ OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+ for (int axis = 0; axis < 3; ++axis)
+ {
+ memset(buffer, 0, sizeof(OverdrawBuffer));
+ rasterizeTriangles(buffer, triangles, index_count, axis);
+
+ unsigned int covered = 0;
+
+ for (int y = 0; y < kViewport; ++y)
+ for (int x = 0; x < kViewport; ++x)
+ covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0;
+
+ result.coverage[axis] = float(covered) / float(kViewport * kViewport);
+ }
+
+ result.extent = extent;
+
+ return result;
+}
diff --git a/Source/ThirdParty/meshoptimizer/simplifier.cpp b/Source/ThirdParty/meshoptimizer/simplifier.cpp
index e59b4afcd..14d4d42fe 100644
--- a/Source/ThirdParty/meshoptimizer/simplifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp
@@ -27,6 +27,7 @@
// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
+// Hugues Hoppe, Steve Marschner. Efficient Minimization of New Quadric Metric for Simplifying Meshes with Appearance Attributes. 2000
namespace meshopt
{
@@ -118,10 +119,17 @@ struct PositionHasher
unsigned int ri = sparse_remap ? sparse_remap[index] : index;
const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + ri * vertex_stride_float);
+ unsigned int x = key[0], y = key[1], z = key[2];
+
+ // replace negative zero with zero
+ x = (x == 0x80000000) ? 0 : x;
+ y = (y == 0x80000000) ? 0 : y;
+ z = (z == 0x80000000) ? 0 : z;
+
// scramble bits to make sure that integer coordinates have entropy in lower bits
- unsigned int x = key[0] ^ (key[0] >> 17);
- unsigned int y = key[1] ^ (key[1] >> 17);
- unsigned int z = key[2] ^ (key[2] >> 17);
+ x ^= x >> 17;
+ y ^= y >> 17;
+ z ^= z >> 17;
// Optimized Spatial Hashing for Collision Detection of Deformable Objects
return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
@@ -132,7 +140,10 @@ struct PositionHasher
unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs;
unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs;
- return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0;
+ const float* lv = vertex_positions + li * vertex_stride_float;
+ const float* rv = vertex_positions + ri * vertex_stride_float;
+
+ return lv[0] == rv[0] && lv[1] == rv[1] && lv[2] == rv[2];
}
};
@@ -208,6 +219,11 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
remap[index] = *entry;
}
+ allocator.deallocate(table);
+
+ if (!wedge)
+ return;
+
// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
for (size_t i = 0; i < vertex_count; ++i)
@@ -221,22 +237,24 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
wedge[i] = wedge[r];
wedge[r] = unsigned(i);
}
-
- allocator.deallocate(table);
}
static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator)
{
// use a bit set to compute the precise number of unique vertices
unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
- memset(filter, 0, (vertex_count + 7) / 8);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+ filter[index / 8] = 0;
+ }
size_t unique = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
- assert(index < vertex_count);
-
unique += (filter[index / 8] & (1 << (index % 8))) == 0;
filter[index / 8] |= 1 << (index % 8);
}
@@ -255,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
-
unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
if (*entry == ~0u)
@@ -288,14 +305,14 @@ enum VertexKind
};
// manifold vertices can collapse onto anything
-// border/seam vertices can only be collapsed onto border/seam respectively
+// border/seam vertices can collapse onto border/seam respectively, or locked
// complex vertices can collapse onto complex/locked
// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
{1, 1, 1, 1, 1},
- {0, 1, 0, 0, 0},
- {0, 0, 1, 0, 0},
+ {0, 1, 0, 0, 1},
+ {0, 0, 1, 0, 1},
{0, 0, 0, 1, 1},
{0, 0, 0, 0, 0},
};
@@ -303,11 +320,13 @@ const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
// note that for seam edges, the opposite edge isn't present in the attribute-based topology
// but is present if you consider a position-only mesh variant
+// while many complex collapses have the opposite edge, since complex vertices collapse to the
+// same wedge, keeping opposite edges separate improves the quality by considering both targets
const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
- {1, 1, 1, 0, 1},
+ {1, 1, 1, 1, 1},
{1, 0, 1, 0, 0},
{1, 1, 1, 0, 1},
- {0, 0, 0, 0, 0},
+ {1, 0, 0, 0, 0},
{1, 0, 1, 0, 0},
};
@@ -323,14 +342,33 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int
return false;
}
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b, const unsigned int* remap, const unsigned int* wedge)
+{
+ unsigned int v = a;
+
+ do
+ {
+ unsigned int count = adjacency.offsets[v + 1] - adjacency.offsets[v];
+ const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[v];
+
+ for (size_t i = 0; i < count; ++i)
+ if (remap[edges[i].next] == remap[b])
+ return true;
+
+ v = wedge[v];
+ } while (v != a);
+
+ return false;
+}
+
static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options)
{
memset(loop, -1, vertex_count * sizeof(unsigned int));
memset(loopback, -1, vertex_count * sizeof(unsigned int));
// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
- // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
- // but here it's okay to fill the data out for other types of vertices as well
+ // note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default
+ // in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices
unsigned int* openinc = loopback;
unsigned int* openout = loop;
@@ -369,12 +407,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
{
if (remap[i] == i)
{
- if (vertex_lock && vertex_lock[sparse_remap ? sparse_remap[i] : i])
- {
- // vertex is explicitly locked
- result[i] = Kind_Locked;
- }
- else if (wedge[i] == i)
+ if (wedge[i] == i)
{
// no attribute seam, need to check if it's manifold
unsigned int openi = openinc[i], openo = openout[i];
@@ -386,6 +419,13 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
{
result[i] = Kind_Manifold;
}
+ else if (openi != ~0u && openo != ~0u && remap[openi] == remap[openo] && openi != i)
+ {
+ // classify half-seams as seams (the branch below would mis-classify them as borders)
+ // half-seam is a single vertex that connects to both vertices of a potential seam
+ // treating these as seams allows collapsing the "full" seam vertex onto them
+ result[i] = Kind_Seam;
+ }
else if (openi != i && openo != i)
{
result[i] = Kind_Border;
@@ -407,7 +447,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
openiw != ~0u && openiw != w && openow != ~0u && openow != w)
{
- if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+ if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw] && remap[openiv] != remap[openov])
{
result[i] = Kind_Seam;
}
@@ -438,6 +478,58 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
}
}
+ if (options & meshopt_SimplifyPermissive)
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[i] == Kind_Seam || result[i] == Kind_Locked)
+ {
+ if (remap[i] != i)
+ {
+ // only process primary vertices; wedges will be updated to match the primary vertex
+ result[i] = result[remap[i]];
+ continue;
+ }
+
+ bool protect = false;
+
+ // vertex_lock may protect any wedge, not just the primary vertex, so we switch to complex only if no wedges are protected
+ unsigned int v = unsigned(i);
+ do
+ {
+ unsigned int rv = sparse_remap ? sparse_remap[v] : v;
+ protect |= vertex_lock && (vertex_lock[rv] & meshopt_SimplifyVertex_Protect) != 0;
+ v = wedge[v];
+ } while (v != i);
+
+ // protect if any adjoining edge doesn't have an opposite edge (indicating vertex is on the border)
+ do
+ {
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[v]];
+ size_t count = adjacency.offsets[v + 1] - adjacency.offsets[v];
+
+ for (size_t j = 0; j < count; ++j)
+ protect |= !hasEdge(adjacency, edges[j].next, v, remap, wedge);
+ v = wedge[v];
+ } while (v != i);
+
+ result[i] = protect ? result[i] : int(Kind_Complex);
+ }
+
+ if (vertex_lock)
+ {
+ // vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+ if (vertex_lock[ri] & meshopt_SimplifyVertex_Lock)
+ result[remap[i]] = Kind_Locked;
+ }
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[remap[i]] == Kind_Locked)
+ result[i] = Kind_Locked;
+ }
+
if (options & meshopt_SimplifyLockBorder)
for (size_t i = 0; i < vertex_count; ++i)
if (result[i] == Kind_Border)
@@ -454,7 +546,7 @@ struct Vector3
float x, y, z;
};
-static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL)
+static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL, float* out_offset = NULL)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
@@ -500,10 +592,17 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat
}
}
+ if (out_offset)
+ {
+ out_offset[0] = minv[0];
+ out_offset[1] = minv[1];
+ out_offset[2] = minv[2];
+ }
+
return extent;
}
-static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* sparse_remap)
+static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* attribute_remap, const unsigned int* sparse_remap)
{
size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
@@ -513,18 +612,61 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
for (size_t k = 0; k < attribute_count; ++k)
{
- float a = vertex_attributes_data[ri * vertex_attributes_stride_float + k];
+ unsigned int rk = attribute_remap[k];
+ float a = vertex_attributes_data[ri * vertex_attributes_stride_float + rk];
- result[i * attribute_count + k] = a * attribute_weights[k];
+ result[i * attribute_count + k] = a * attribute_weights[rk];
}
}
}
-static const size_t kMaxAttributes = 16;
+static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
+{
+ size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
+ size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+ // updating externally locked vertices is not allowed
+ if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
+ continue;
+
+ // moving locked vertices may result in floating point drift
+ if (vertex_kind[i] != Kind_Locked)
+ {
+ const Vector3& p = vertex_positions[i];
+ float* v = vertex_positions_data + ri * vertex_positions_stride_float;
+
+ v[0] = p.x * vertex_scale + vertex_offset[0];
+ v[1] = p.y * vertex_scale + vertex_offset[1];
+ v[2] = p.z * vertex_scale + vertex_offset[2];
+ }
+
+ if (attribute_count)
+ {
+ const float* sa = vertex_attributes + i * attribute_count;
+ float* va = vertex_attributes_data + ri * vertex_attributes_stride_float;
+
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ unsigned int rk = attribute_remap[k];
+
+ va[rk] = sa[k] / attribute_weights[rk];
+ }
+ }
+ }
+}
+
+static const size_t kMaxAttributes = 32;
struct Quadric
{
- // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c
+ // a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c
float a00, a11, a22;
float a10, a20, a21;
float b0, b1, b2, c;
@@ -586,6 +728,14 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
Q.w += R.w;
}
+static void quadricAdd(QuadricGrad& G, const QuadricGrad& R)
+{
+ G.gx += R.gx;
+ G.gy += R.gy;
+ G.gz += R.gz;
+ G.gw += R.gw;
+}
+
static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count)
{
for (size_t k = 0; k < attribute_count; ++k)
@@ -597,7 +747,7 @@ static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_co
}
}
-static float quadricError(const Quadric& Q, const Vector3& v)
+static float quadricEval(const Quadric& Q, const Vector3& v)
{
float rx = Q.b0;
float ry = Q.b1;
@@ -620,6 +770,12 @@ static float quadricError(const Quadric& Q, const Vector3& v)
r += ry * v.y;
r += rz * v.z;
+ return r;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+ float r = quadricEval(Q, v);
float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
return fabsf(r) * s;
@@ -627,26 +783,7 @@ static float quadricError(const Quadric& Q, const Vector3& v)
static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va)
{
- float rx = Q.b0;
- float ry = Q.b1;
- float rz = Q.b2;
-
- rx += Q.a10 * v.y;
- ry += Q.a21 * v.z;
- rz += Q.a20 * v.x;
-
- rx *= 2;
- ry *= 2;
- rz *= 2;
-
- rx += Q.a00 * v.x;
- ry += Q.a11 * v.y;
- rz += Q.a22 * v.z;
-
- float r = Q.c;
- r += rx * v.x;
- r += ry * v.y;
- r += rz * v.z;
+ float r = quadricEval(Q, v);
// see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
for (size_t k = 0; k < attribute_count; ++k)
@@ -654,14 +791,11 @@ static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribu
float a = va[k];
float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw;
- r += a * a * Q.w;
- r -= 2 * a * g;
+ r += a * (a * Q.w - 2 * g);
}
- // TODO: weight normalization is breaking attribute error somehow
- float s = 1; // Q.w == 0.f ? 0.f : 1.f / Q.w;
-
- return fabsf(r) * s;
+ // note: unlike position error, we do not normalize by Q.w to retain edge scaling as described in quadricFromAttributes
+ return fabsf(r);
}
static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
@@ -684,6 +818,17 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
Q.w = w;
}
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+ Q.a00 = Q.a11 = Q.a22 = w;
+ Q.a10 = Q.a20 = Q.a21 = 0;
+ Q.b0 = -x * w;
+ Q.b1 = -y * w;
+ Q.b2 = -z * w;
+ Q.c = (x * x + y * y + z * z) * w;
+ Q.w = w;
+}
+
static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
{
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
@@ -702,20 +847,24 @@ static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1
static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
{
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
- float length = normalize(p10);
- // p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+ // edge length; keep squared length around for projection correction
+ float lengthsq = p10.x * p10.x + p10.y * p10.y + p10.z * p10.z;
+ float length = sqrtf(lengthsq);
+
+ // p20p = length of projection of p2-p0 onto p1-p0; note that p10 is unnormalized so we need to correct it later
Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
- // normal = altitude of triangle from point p2 onto edge p1-p0
- Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
- normalize(normal);
+ // perp = perpendicular vector from p2 to line segment p1-p0
+ // note: since p10 is unnormalized we need to correct the projection; we scale p20 instead to take advantage of normalize below
+ Vector3 perp = {p20.x * lengthsq - p10.x * p20p, p20.y * lengthsq - p10.y * p20p, p20.z * lengthsq - p10.z * p20p};
+ normalize(perp);
- float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+ float distance = perp.x * p0.x + perp.y * p0.y + perp.z * p0.z;
// note: the weight is scaled linearly with edge length; this has to match the triangle weight
- quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+ quadricFromPlane(Q, perp.x, perp.y, perp.z, -distance, length * weight);
}
static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count)
@@ -728,16 +877,21 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
- // weight is scaled linearly with edge length
+ // normal = cross(p1 - p0, p2 - p0)
Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
- float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
- float w = sqrtf(area); // TODO this needs more experimentation
+ float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z) * 0.5f;
+
+ // quadric is weighted with the square of edge length (= area)
+ // this equalizes the units with the positional error (which, after normalization, is a square of distance)
+ // as a result, a change in weighted attribute of 1 along distance d is approximately equivalent to a change in position of d
+ float w = area;
// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
// v = (d11 * d20 - d01 * d21) / denom
// w = (d00 * d21 - d01 * d20) / denom
// u = 1 - v - w
// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+ // note: v2 and d20/d21 can not be evaluated here as v2 is effectively an unknown variable; we need these only as variables for derivation of gradients
const Vector3& v0 = p10;
const Vector3& v1 = p20;
float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
@@ -747,7 +901,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
float denomr = denom == 0 ? 0.f : 1.f / denom;
// precompute gradient factors
- // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+ // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out expressions that are shared between attributes
float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
@@ -772,6 +926,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+ // note: for simplicity we scale all factors by weight here instead of outside the loop
Q.a00 += w * (gx * gx);
Q.a11 += w * (gy * gy);
Q.a22 += w * (gz * gz);
@@ -794,7 +949,112 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
}
}
-static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+static void quadricVolumeGradient(QuadricGrad& G, const Vector3& p0, const Vector3& p1, const Vector3& p2)
+{
+ Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+ Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+ // normal = cross(p1 - p0, p2 - p0)
+ Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+ float area = normalize(normal) * 0.5f;
+
+ G.gx = normal.x * area;
+ G.gy = normal.y * area;
+ G.gz = normal.z * area;
+ G.gw = (-p0.x * normal.x - p0.y * normal.y - p0.z * normal.z) * area;
+}
+
+static bool quadricSolve(Vector3& p, const Quadric& Q, const QuadricGrad& GV)
+{
+ // solve A*p = -b where A is the quadric matrix and b is the linear term
+ float a00 = Q.a00, a11 = Q.a11, a22 = Q.a22;
+ float a10 = Q.a10, a20 = Q.a20, a21 = Q.a21;
+ float x0 = -Q.b0, x1 = -Q.b1, x2 = -Q.b2;
+
+ float eps = 1e-6f * Q.w;
+
+ // LDL decomposition: A = LDL^T
+ float d0 = a00;
+ float l10 = a10 / d0;
+ float l20 = a20 / d0;
+
+ float d1 = a11 - a10 * l10;
+ float dl21 = a21 - a20 * l10;
+ float l21 = dl21 / d1;
+
+ float d2 = a22 - a20 * l20 - dl21 * l21;
+
+ // solve L*y = x
+ float y0 = x0;
+ float y1 = x1 - l10 * y0;
+ float y2 = x2 - l20 * y0 - l21 * y1;
+
+ // solve D*z = y
+ float z0 = y0 / d0;
+ float z1 = y1 / d1;
+ float z2 = y2 / d2;
+
+ // augment system with linear constraint GV using Lagrange multiplier
+ float a30 = GV.gx, a31 = GV.gy, a32 = GV.gz;
+ float x3 = -GV.gw;
+
+ float l30 = a30 / d0;
+ float dl31 = a31 - a30 * l10;
+ float l31 = dl31 / d1;
+ float dl32 = a32 - a30 * l20 - dl31 * l21;
+ float l32 = dl32 / d2;
+ float d3 = 0.f - a30 * l30 - dl31 * l31 - dl32 * l32;
+
+ float y3 = x3 - l30 * y0 - l31 * y1 - l32 * y2;
+ float z3 = fabsf(d3) > eps ? y3 / d3 : 0.f; // if d3 is zero, we can ignore the constraint
+
+ // substitute L^T*p = z
+ float lambda = z3;
+ float pz = z2 - l32 * lambda;
+ float py = z1 - l21 * pz - l31 * lambda;
+ float px = z0 - l10 * py - l20 * pz - l30 * lambda;
+
+ p.x = px;
+ p.y = py;
+ p.z = pz;
+
+ return fabsf(d0) > eps && fabsf(d1) > eps && fabsf(d2) > eps;
+}
+
+static void quadricReduceAttributes(Quadric& Q, const Quadric& A, const QuadricGrad* G, size_t attribute_count)
+{
+ // update vertex quadric with attribute quadric; multiply by vertex weight to minimize normalized error
+ Q.a00 += A.a00 * Q.w;
+ Q.a11 += A.a11 * Q.w;
+ Q.a22 += A.a22 * Q.w;
+ Q.a10 += A.a10 * Q.w;
+ Q.a20 += A.a20 * Q.w;
+ Q.a21 += A.a21 * Q.w;
+ Q.b0 += A.b0 * Q.w;
+ Q.b1 += A.b1 * Q.w;
+ Q.b2 += A.b2 * Q.w;
+
+ float iaw = A.w == 0 ? 0.f : Q.w / A.w;
+
+ // update linear system based on attribute gradients (BB^T/a)
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ const QuadricGrad& g = G[k];
+
+ Q.a00 -= (g.gx * g.gx) * iaw;
+ Q.a11 -= (g.gy * g.gy) * iaw;
+ Q.a22 -= (g.gz * g.gz) * iaw;
+ Q.a10 -= (g.gx * g.gy) * iaw;
+ Q.a20 -= (g.gx * g.gz) * iaw;
+ Q.a21 -= (g.gy * g.gz) * iaw;
+
+ Q.b0 -= (g.gx * g.gw) * iaw;
+ Q.b1 -= (g.gy * g.gw) * iaw;
+ Q.b2 -= (g.gz * g.gw) * iaw;
+ }
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, QuadricGrad* volume_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
{
for (size_t i = 0; i < index_count; i += 3)
{
@@ -808,6 +1068,36 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
quadricAdd(vertex_quadrics[remap[i2]], Q);
+
+ if (volume_gradients)
+ {
+ QuadricGrad GV;
+ quadricVolumeGradient(GV, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2]);
+
+ quadricAdd(volume_gradients[remap[i0]], GV);
+ quadricAdd(volume_gradients[remap[i1]], GV);
+ quadricAdd(volume_gradients[remap[i2]], GV);
+ }
+ }
+}
+
+static void fillVertexQuadrics(Quadric* vertex_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* remap, unsigned int options)
+{
+ // by default, we use a very small weight to improve triangulation and numerical stability without affecting the shape or error
+ float factor = (options & meshopt_SimplifyRegularize) ? 1e-1f : 1e-7f;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] != i)
+ continue;
+
+ const Vector3& p = vertex_positions[i];
+ float w = vertex_quadrics[i].w * factor;
+
+ Quadric Q;
+ quadricFromPoint(Q, p.x, p.y, p.z, w);
+
+ quadricAdd(vertex_quadrics[i], Q);
}
}
@@ -837,15 +1127,11 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
continue;
- // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
- if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
- continue;
-
unsigned int i2 = indices[i + next[e + 1]];
// we try hard to maintain border edge geometry; seam edges can move more freely
// due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
- const float kEdgeWeightSeam = 1.f;
+ const float kEdgeWeightSeam = 0.5f; // applied twice due to opposite edges
const float kEdgeWeightBorder = 10.f;
float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
@@ -853,13 +1139,20 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
Quadric Q;
quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+ Quadric QT;
+ quadricFromTriangle(QT, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+ // mix edge quadric with triangle quadric to stabilize collapses in both directions; both quadrics inherit edge weight so that their error is added
+ QT.w = 0;
+ quadricAdd(Q, QT);
+
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
}
}
}
-static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap)
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count)
{
for (size_t i = 0; i < index_count; i += 3)
{
@@ -871,14 +1164,13 @@ static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attr
QuadricGrad G[kMaxAttributes];
quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
- // TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
- quadricAdd(attribute_quadrics[remap[i0]], QA);
- quadricAdd(attribute_quadrics[remap[i1]], QA);
- quadricAdd(attribute_quadrics[remap[i2]], QA);
+ quadricAdd(attribute_quadrics[i0], QA);
+ quadricAdd(attribute_quadrics[i1], QA);
+ quadricAdd(attribute_quadrics[i2], QA);
- quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count);
- quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count);
- quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i0 * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i1 * attribute_count], G, attribute_count);
+ quadricAdd(&attribute_gradients[i2 * attribute_count], G, attribute_count);
}
}
@@ -922,6 +1214,30 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
continue;
// early-out when at least one triangle flips due to a collapse
+ if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
+ {
+#if TRACE >= 2
+ printf("edge block %d -> %d: flip welded %d %d %d\n", i0, i1, a, i0, b);
+#endif
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0, const Vector3& v1)
+{
+ const Vector3& v0 = vertex_positions[i0];
+
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+ size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int a = edges[i].next, b = edges[i].prev;
+
if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
return true;
}
@@ -929,6 +1245,46 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
return false;
}
+static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0)
+{
+ const Vector3& v0 = vertex_positions[i0];
+
+ const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+ size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+ float result = 0.f;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int a = edges[i].next, b = edges[i].prev;
+
+ const Vector3& va = vertex_positions[a];
+ const Vector3& vb = vertex_positions[b];
+
+ float da = (va.x - v0.x) * (va.x - v0.x) + (va.y - v0.y) * (va.y - v0.y) + (va.z - v0.z) * (va.z - v0.z);
+ float db = (vb.x - v0.x) * (vb.x - v0.x) + (vb.y - v0.y) * (vb.y - v0.y) + (vb.z - v0.z) * (vb.z - v0.z);
+
+ result = result < da ? da : result;
+ result = result < db ? db : result;
+ }
+
+ return sqrtf(result);
+}
+
+static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback)
+{
+ unsigned int r = remap[target];
+
+ // use loop metadata to guide complex collapses towards the correct wedge
+ // this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams
+ if (loop[v] != ~0u && remap[loop[v]] == r)
+ return loop[v];
+ else if (loopback[v] != ~0u && remap[loopback[v]] == r)
+ return loopback[v];
+ else
+ return target;
+}
+
static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
{
size_t dual_count = 0;
@@ -947,7 +1303,7 @@ static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_c
return (index_count - dual_count / 2) + 3;
}
-static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
{
size_t collapse_count = 0;
@@ -983,8 +1339,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
// two vertices are on a border or a seam, but there's no direct edge between them
// this indicates that they belong to two different edge loops and we should not collapse this edge
- // loop[] tracks half edges so we only need to check i0->i1
- if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+ // loop[] and loopback[] track half edges so we only need to check one of them
+ if ((k0 == Kind_Border || k0 == Kind_Seam) && k1 != Kind_Manifold && loop[i0] != i1)
+ continue;
+ if ((k1 == Kind_Border || k1 == Kind_Seam) && k0 != Kind_Manifold && loopback[i1] != i0)
continue;
// edge can be collapsed in either direction - we will pick the one with minimum error
@@ -1009,7 +1367,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
return collapse_count;
}
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
{
for (size_t i = 0; i < collapse_count; ++i)
{
@@ -1017,40 +1375,94 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
unsigned int i0 = c.v0;
unsigned int i1 = c.v1;
-
- // most edges are bidirectional which means we need to evaluate errors for two collapses
- // to keep this code branchless we just use the same edge for unidirectional edges
- unsigned int j0 = c.bidi ? i1 : i0;
- unsigned int j1 = c.bidi ? i0 : i1;
+ bool bidi = c.bidi;
float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
- float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+ float ej = bidi ? quadricError(vertex_quadrics[remap[i1]], vertex_positions[i0]) : FLT_MAX;
+
+#if TRACE >= 3
+ float di = ei, dj = ej;
+#endif
if (attribute_count)
{
- ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
- ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+ ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+ ej += bidi ? quadricError(attribute_quadrics[i1], &attribute_gradients[i1 * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]) : 0;
+
+ // seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+ if (vertex_kind[i0] == Kind_Seam)
+ {
+ // for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+ unsigned int s0 = wedge[i0];
+ unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+ assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+ assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+ // note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe
+ s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+ ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+ ej += bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+ }
+ else
+ {
+ // complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target
+ if (vertex_kind[i0] == Kind_Complex)
+ for (unsigned int v = wedge[i0]; v != i0; v = wedge[v])
+ {
+ unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+ ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+ }
+
+ if (vertex_kind[i1] == Kind_Complex && bidi)
+ for (unsigned int v = wedge[i1]; v != i1; v = wedge[v])
+ {
+ unsigned int t = getComplexTarget(v, i0, remap, loop, loopback);
+
+ ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+ }
+ }
}
- // pick edge direction with minimal error
- c.v0 = ei <= ej ? i0 : j0;
- c.v1 = ei <= ej ? i1 : j1;
- c.error = ei <= ej ? ei : ej;
+ // pick edge direction with minimal error (branchless)
+ bool rev = bidi & (ej < ei);
+
+ c.v0 = rev ? i1 : i0;
+ c.v1 = rev ? i0 : i1;
+ c.error = ej < ei ? ej : ei;
+
+#if TRACE >= 3
+ if (bidi)
+ printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n",
+ rev ? i1 : i0, rev ? i0 : i1,
+ sqrtf(rev ? ej : ei), sqrtf(rev ? dj : di), sqrtf(rev ? ej - dj : ei - di),
+ sqrtf(rev ? ei : ej), sqrtf(rev ? di : dj), sqrtf(rev ? ei - di : ej - dj));
+ else
+ printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", i0, i1, sqrtf(c.error), sqrtf(di), sqrtf(ei - di));
+#endif
}
}
static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
{
- const int sort_bits = 11;
+ // we use counting sort to order collapses by error; since the exact sort order is not as critical,
+ // only top 12 bits of exponent+mantissa (8 bits of exponent and 4 bits of mantissa) are used.
+ // to avoid excessive stack usage, we clamp the exponent range as collapses with errors much higher than 1 are not useful.
+ const unsigned int sort_bits = 12;
+ const unsigned int sort_bins = 2048 + 512; // exponent range [-127, 32)
// fill histogram for counting sort
- unsigned int histogram[1 << sort_bits];
+ unsigned int histogram[sort_bins];
memset(histogram, 0, sizeof(histogram));
for (size_t i = 0; i < collapse_count; ++i)
{
// skip sign bit since error is non-negative
- unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+ unsigned int error = collapses[i].errorui;
+ unsigned int key = (error << 1) >> (32 - sort_bits);
+ key = key < sort_bins ? key : sort_bins - 1;
histogram[key]++;
}
@@ -1058,7 +1470,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
// compute offsets based on histogram data
size_t histogram_sum = 0;
- for (size_t i = 0; i < 1 << sort_bits; ++i)
+ for (size_t i = 0; i < sort_bins; ++i)
{
size_t count = histogram[i];
histogram[i] = unsigned(histogram_sum);
@@ -1071,13 +1483,15 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
for (size_t i = 0; i < collapse_count; ++i)
{
// skip sign bit since error is non-negative
- unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+ unsigned int error = collapses[i].errorui;
+ unsigned int key = (error << 1) >> (32 - sort_bits);
+ key = key < sort_bins ? key : sort_bins - 1;
sort_order[histogram[key]++] = unsigned(i);
}
}
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
{
size_t edge_collapses = 0;
size_t triangle_collapses = 0;
@@ -1087,7 +1501,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
size_t edge_collapse_goal = triangle_collapse_goal / 2;
#if TRACE
- size_t stats[4] = {};
+ size_t stats[7] = {};
#endif
for (size_t i = 0; i < collapse_count; ++i)
@@ -1097,10 +1511,16 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
TRACESTATS(0);
if (c.error > error_limit)
+ {
+ TRACESTATS(4);
break;
+ }
if (triangle_collapses >= triangle_collapse_goal)
+ {
+ TRACESTATS(5);
break;
+ }
// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
// as they will share vertices with other successfull collapses, we need to increase the acceptable error by some factor
@@ -1108,8 +1528,11 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
// topology, we only abort if we got over 1/6 collapses accordingly.
- if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 6)
+ if (c.error > error_goal && c.error > result_error && triangle_collapses > triangle_collapse_goal / 6)
+ {
+ TRACESTATS(6);
break;
+ }
unsigned int i0 = c.v0;
unsigned int i1 = c.v1;
@@ -1117,6 +1540,8 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
unsigned int r0 = remap[i0];
unsigned int r1 = remap[i1];
+ unsigned char kind = vertex_kind[i0];
+
// we don't collapse vertices that had source or target vertex involved in a collapse
// it's important to not move the vertices twice since it complicates the tracking/remapping logic
// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
@@ -1135,35 +1560,41 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
continue;
}
+#if TRACE >= 2
+ printf("edge commit %d -> %d: kind %d->%d, error %f\n", i0, i1, vertex_kind[i0], vertex_kind[i1], sqrtf(c.error));
+#endif
+
assert(collapse_remap[r0] == r0);
assert(collapse_remap[r1] == r1);
- quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
-
- if (attribute_count)
- {
- quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
- quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
- }
-
- if (vertex_kind[i0] == Kind_Complex)
+ if (kind == Kind_Complex)
{
+ // remap all vertices in the complex to the target vertex
unsigned int v = i0;
do
{
- collapse_remap[v] = r1;
+ unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+ collapse_remap[v] = t;
v = wedge[v];
} while (v != i0);
}
- else if (vertex_kind[i0] == Kind_Seam)
+ else if (kind == Kind_Seam)
{
- // remap v0 to v1 and seam pair of v0 to seam pair of v1
+ // for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
unsigned int s0 = wedge[i0];
- unsigned int s1 = wedge[i1];
+ unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+ assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+ assert(s1 != ~0u && remap[s1] == r1);
- assert(s0 != i0 && s1 != i1);
- assert(wedge[s0] == i0 && wedge[s1] == i1);
+ // additional asserts to verify that the seam pair is consistent
+ assert(kind != vertex_kind[i1] || s1 == wedge[i1]);
+ assert(loop[i0] == i1 || loopback[i0] == i1);
+ assert(loop[s0] == s1 || loopback[s0] == s1);
+
+ // note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe
+ s1 = (s1 != ~0u) ? s1 : wedge[i1];
collapse_remap[i0] = i1;
collapse_remap[s0] = s1;
@@ -1175,28 +1606,205 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
collapse_remap[i0] = i1;
}
+ // note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used
+ // however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling
collapse_locked[r0] = 1;
collapse_locked[r1] = 1;
// border edges collapse 1 triangle, other edges collapse 2 or more
- triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+ triangle_collapses += (kind == Kind_Border) ? 1 : 2;
edge_collapses++;
result_error = result_error < c.error ? c.error : result_error;
}
#if TRACE
- float error_goal_perfect = edge_collapse_goal < collapse_count ? collapses[collapse_order[edge_collapse_goal]].error : 0.f;
+ float error_goal_last = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+ float error_goal_limit = error_goal_last < error_limit ? error_goal_last : error_limit;
- printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d)\n",
- int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_perfect),
- int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]));
+ printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d); %s\n",
+ int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_limit),
+ int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]),
+ stats[4] ? "error limit" : (stats[5] ? "count limit" : (stats[6] ? "error goal" : "out of collapses")));
#endif
return edge_collapses;
}
-static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, QuadricGrad* volume_gradients, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (collapse_remap[i] == i)
+ continue;
+
+ unsigned int i0 = unsigned(i);
+ unsigned int i1 = collapse_remap[i];
+
+ unsigned int r0 = remap[i0];
+ unsigned int r1 = remap[i1];
+
+ // ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved
+ if (i0 == r0)
+ {
+ quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+ if (volume_gradients)
+ quadricAdd(volume_gradients[r1], volume_gradients[r0]);
+ }
+
+ if (attribute_count)
+ {
+ quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]);
+ quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count);
+
+ if (i0 == r0)
+ {
+ // when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment
+ float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]);
+ vertex_error = vertex_error < derr ? derr : vertex_error;
+ }
+ }
+ }
+}
+
+static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+#if TRACE
+ size_t stats[6] = {};
+#endif
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ // moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
+ // moving vertices on a border requires a stronger edge quadric to preserve the border geometry
+ if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
+ continue;
+
+ if (remap[i] != i)
+ {
+ vertex_positions[i] = vertex_positions[remap[i]];
+ continue;
+ }
+
+ TRACESTATS(0);
+
+ const Vector3& vp = vertex_positions[i];
+
+ Quadric Q = vertex_quadrics[i];
+ QuadricGrad GV = {};
+
+ // add a point quadric for regularization to stabilize the solution
+ Quadric R;
+ quadricFromPoint(R, vp.x, vp.y, vp.z, Q.w * 1e-4f);
+ quadricAdd(Q, R);
+
+ if (attribute_count)
+ {
+ // optimal point simultaneously minimizes attribute quadrics for all wedges
+ unsigned int v = unsigned(i);
+ do
+ {
+ quadricReduceAttributes(Q, attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count);
+ v = wedge[v];
+ } while (v != i);
+
+ // minimizing attribute quadrics results in volume loss so we incorporate volume gradient as a constraint
+ if (volume_gradients)
+ GV = volume_gradients[i];
+ }
+
+ Vector3 p;
+ if (!quadricSolve(p, Q, GV))
+ {
+ TRACESTATS(2);
+ continue;
+ }
+
+ // reject updates that move the vertex too far from its neighborhood
+ // this detects and fixes most cases when the quadric is not well-defined
+ float nr = getNeighborhoodRadius(adjacency, vertex_positions, unsigned(i));
+ float dp = (p.x - vp.x) * (p.x - vp.x) + (p.y - vp.y) * (p.y - vp.y) + (p.z - vp.z) * (p.z - vp.z);
+
+ if (dp > nr * nr)
+ {
+ TRACESTATS(3);
+ continue;
+ }
+
+ // reject updates that would flip a neighboring triangle, as we do for edge collapse
+ if (hasTriangleFlips(adjacency, vertex_positions, unsigned(i), p))
+ {
+ TRACESTATS(4);
+ continue;
+ }
+
+ // reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+ if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+ {
+ TRACESTATS(5);
+ continue;
+ }
+
+ TRACESTATS(1);
+ vertex_positions[i] = p;
+ }
+
+#if TRACE
+ printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
+#endif
+}
+
+static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (!vertex_update[i])
+ continue;
+
+ if (remap[i] != i)
+ continue;
+
+ for (size_t k = 0; k < attribute_count; ++k)
+ {
+ unsigned int shared = ~0u;
+
+ // for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+ if (vertex_kind[i] == Kind_Complex)
+ {
+ shared = unsigned(i);
+
+ for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+ if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+ shared = ~0u;
+ else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+ shared = v;
+ }
+
+ // update attributes for all wedges
+ unsigned int v = unsigned(i);
+ do
+ {
+ unsigned int r = (shared == ~0u) ? v : shared;
+
+ const Vector3& p = vertex_positions[i]; // same for all wedges
+ const Quadric& A = attribute_quadrics[r];
+ const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
+ float iw = A.w == 0 ? 0.f : 1.f / A.w;
+ float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+ vertex_attributes[v * attribute_count + k] = av;
+ v = wedge[v];
+ } while (v != i);
+ }
+ }
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap, const unsigned int* remap)
{
size_t write = 0;
@@ -1211,7 +1819,14 @@ static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const
assert(collapse_remap[v1] == v1);
assert(collapse_remap[v2] == v2);
- if (v0 != v1 && v0 != v2 && v1 != v2)
+ // collapse zero area triangles even if they are not topologically degenerate
+ // this is required to cleanup manifold->seam collapses when a vertex is collapsed onto a seam pair
+ // as well as complex collapses and some other cases where cross wedge collapses are performed
+ unsigned int r0 = remap[v0];
+ unsigned int r1 = remap[v1];
+ unsigned int r2 = remap[v2];
+
+ if (r0 != r1 && r0 != r2 && r1 != r2)
{
indices[write + 0] = v0;
indices[write + 1] = v1;
@@ -1227,17 +1842,183 @@ static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsign
{
for (size_t i = 0; i < vertex_count; ++i)
{
+ // note: this is a no-op for vertices that were remapped
+ // ideally we would clear the loop entries for those for consistency, even though they aren't going to be used
+ // however, the remapping process needs loop information for remapped vertices, so this would require a separate pass
if (loop[i] != ~0u)
{
unsigned int l = loop[i];
unsigned int r = collapse_remap[l];
// i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
- loop[i] = (i == r) ? loop[l] : r;
+ if (i == r)
+ loop[i] = (loop[l] != ~0u) ? collapse_remap[loop[l]] : ~0u;
+ else
+ loop[i] = r;
}
}
}
+static unsigned int follow(unsigned int* parents, unsigned int index)
+{
+ while (index != parents[index])
+ {
+ unsigned int parent = parents[index];
+ parents[index] = parents[parent];
+ index = parent;
+ }
+
+ return index;
+}
+
+static size_t buildComponents(unsigned int* components, size_t vertex_count, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ components[i] = unsigned(i);
+
+ // compute a unique (but not sequential!) index for each component via union-find
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ static const int next[4] = {1, 2, 0, 1};
+
+ for (int e = 0; e < 3; ++e)
+ {
+ unsigned int i0 = indices[i + e];
+ unsigned int i1 = indices[i + next[e]];
+
+ unsigned int r0 = remap[i0];
+ unsigned int r1 = remap[i1];
+
+ r0 = follow(components, r0);
+ r1 = follow(components, r1);
+
+ // merge components with larger indices into components with smaller indices
+ // this guarantees that the root of the component is always the one with the smallest index
+ if (r0 != r1)
+ components[r0 < r1 ? r1 : r0] = r0 < r1 ? r0 : r1;
+ }
+ }
+
+ // make sure each element points to the component root *before* we renumber the components
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (remap[i] == i)
+ components[i] = follow(components, unsigned(i));
+
+ unsigned int next_component = 0;
+
+ // renumber components using sequential indices
+ // a sequential pass is sufficient because component root always has the smallest index
+ // note: it is unsafe to use follow() in this pass because we're replacing component links with sequential indices inplace
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] == i)
+ {
+ unsigned int root = components[i];
+ assert(root <= i); // make sure we already computed the component for non-roots
+ components[i] = (root == i) ? next_component++ : components[root];
+ }
+ else
+ {
+ assert(remap[i] < i); // make sure we already computed the component
+ components[i] = components[remap[i]];
+ }
+ }
+
+ return next_component;
+}
+
+static void measureComponents(float* component_errors, size_t component_count, const unsigned int* components, const Vector3* vertex_positions, size_t vertex_count)
+{
+ memset(component_errors, 0, component_count * 4 * sizeof(float));
+
+ // compute approximate sphere center for each component as an average
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int c = components[i];
+ assert(components[i] < component_count);
+
+ Vector3 v = vertex_positions[i]; // copy avoids aliasing issues
+
+ component_errors[c * 4 + 0] += v.x;
+ component_errors[c * 4 + 1] += v.y;
+ component_errors[c * 4 + 2] += v.z;
+ component_errors[c * 4 + 3] += 1; // weight
+ }
+
+ // complete the center computation, and reinitialize [3] as a radius
+ for (size_t i = 0; i < component_count; ++i)
+ {
+ float w = component_errors[i * 4 + 3];
+ float iw = w == 0.f ? 0.f : 1.f / w;
+
+ component_errors[i * 4 + 0] *= iw;
+ component_errors[i * 4 + 1] *= iw;
+ component_errors[i * 4 + 2] *= iw;
+ component_errors[i * 4 + 3] = 0; // radius
+ }
+
+ // compute squared radius for each component
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int c = components[i];
+
+ float dx = vertex_positions[i].x - component_errors[c * 4 + 0];
+ float dy = vertex_positions[i].y - component_errors[c * 4 + 1];
+ float dz = vertex_positions[i].z - component_errors[c * 4 + 2];
+ float r = dx * dx + dy * dy + dz * dz;
+
+ component_errors[c * 4 + 3] = component_errors[c * 4 + 3] < r ? r : component_errors[c * 4 + 3];
+ }
+
+ // we've used the output buffer as scratch space, so we need to move the results to proper indices
+ for (size_t i = 0; i < component_count; ++i)
+ {
+#if TRACE >= 2
+ printf("component %d: center %f %f %f, error %e\n", int(i),
+ component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3]));
+#endif
+ // note: we keep the squared error to make it match quadric error metric
+ component_errors[i] = component_errors[i * 4 + 3];
+ }
+}
+
+static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror)
+{
+ (void)component_count;
+
+ size_t write = 0;
+ float min_error = FLT_MAX;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int v0 = indices[i + 0], v1 = indices[i + 1], v2 = indices[i + 2];
+ unsigned int c = components[v0];
+ assert(c == components[v1] && c == components[v2]);
+
+ if (component_errors[c] > error_cutoff)
+ {
+ min_error = min_error > component_errors[c] ? component_errors[c] : min_error;
+
+ indices[write + 0] = v0;
+ indices[write + 1] = v1;
+ indices[write + 2] = v2;
+ write += 3;
+ }
+ }
+
+#if TRACE
+ size_t pruned_components = 0;
+ for (size_t i = 0; i < component_count; ++i)
+ pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff);
+
+ printf("pruned %d triangles in %d components (goal %e); next %e\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff), min_error < FLT_MAX ? sqrtf(min_error) : min_error * 2);
+#endif
+
+ // update next error with the smallest error of the remaining components
+ nexterror = min_error;
+ return write;
+}
+
struct CellHasher
{
const unsigned int* vertex_ids;
@@ -1299,7 +2080,7 @@ struct TriangleHasher
}
};
-static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, const unsigned char* vertex_lock, size_t vertex_count, int grid_size)
{
assert(grid_size >= 1 && grid_size <= 1024);
float cell_scale = float(grid_size - 1);
@@ -1312,7 +2093,10 @@ static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_pos
int yi = int(v.y * cell_scale + 0.5f);
int zi = int(v.z * cell_scale + 0.5f);
- vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
+ if (vertex_lock && (vertex_lock[i] & meshopt_SimplifyVertex_Lock))
+ vertex_ids[i] = (1 << 30) | unsigned(i);
+ else
+ vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
}
}
@@ -1541,17 +2325,17 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
// three point interpolation from "revenge of interpolation search" paper
float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
- return x1 + num / den;
+ return x1 + (den == 0.f ? 0.f : num / den);
}
} // namespace meshopt
-#ifndef NDEBUG
-// Note: this is only exposed for debug visualization purposes; do *not* use these in debug builds
-MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = NULL;
-#endif
+// Note: this is only exposed for development purposes; do *not* use
+enum
+{
+ meshopt_SimplifyInternalSolve = 1 << 29,
+ meshopt_SimplifyInternalDebug = 1 << 30
+};
size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
@@ -1561,10 +2345,13 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
- assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute)) == 0);
+ assert(target_error >= 0);
+ assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyRegularize | meshopt_SimplifyPermissive | meshopt_SimplifyInternalSolve | meshopt_SimplifyInternalDebug)) == 0);
assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
assert(vertex_attributes_stride % sizeof(float) == 0);
assert(attribute_count <= kMaxAttributes);
+ for (size_t i = 0; i < attribute_count; ++i)
+ assert(attribute_weights[i] >= 0);
meshopt_Allocator allocator;
@@ -1584,6 +2371,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL);
// build position remap that maps each vertex to the one with identical position
+ // wedge table stores next vertex with identical position for each vertex
unsigned int* remap = allocator.allocate(vertex_count);
unsigned int* wedge = allocator.allocate(vertex_count);
buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator);
@@ -1610,14 +2398,23 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
#endif
Vector3* vertex_positions = allocator.allocate(vertex_count);
- float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap);
+ float vertex_offset[3] = {};
+ float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, vertex_offset);
float* vertex_attributes = NULL;
+ unsigned int attribute_remap[kMaxAttributes];
if (attribute_count)
{
+ // remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics
+ size_t attributes_used = 0;
+ for (size_t i = 0; i < attribute_count; ++i)
+ if (attribute_weights[i] > 0)
+ attribute_remap[attributes_used++] = unsigned(i);
+
+ attribute_count = attributes_used;
vertex_attributes = allocator.allocate(vertex_count * attribute_count);
- rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, sparse_remap);
+ rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, attribute_remap, sparse_remap);
}
Quadric* vertex_quadrics = allocator.allocate(vertex_count);
@@ -1625,6 +2422,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
Quadric* attribute_quadrics = NULL;
QuadricGrad* attribute_gradients = NULL;
+ QuadricGrad* volume_gradients = NULL;
if (attribute_count)
{
@@ -1633,13 +2431,42 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
attribute_gradients = allocator.allocate(vertex_count * attribute_count);
memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+
+ if (options & meshopt_SimplifyInternalSolve)
+ {
+ volume_gradients = allocator.allocate(vertex_count);
+ memset(volume_gradients, 0, vertex_count * sizeof(QuadricGrad));
+ }
}
- fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap);
+ fillFaceQuadrics(vertex_quadrics, volume_gradients, result, index_count, vertex_positions, remap);
+ fillVertexQuadrics(vertex_quadrics, vertex_positions, vertex_count, remap, options);
fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
if (attribute_count)
- fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
+ fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count);
+
+ unsigned int* components = NULL;
+ float* component_errors = NULL;
+ size_t component_count = 0;
+ float component_nexterror = 0;
+
+ if (options & meshopt_SimplifyPrune)
+ {
+ components = allocator.allocate(vertex_count);
+ component_count = buildComponents(components, vertex_count, result, index_count, remap);
+
+ component_errors = allocator.allocate(component_count * 4); // overallocate for temporary use inside measureComponents
+ measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+ component_nexterror = FLT_MAX;
+ for (size_t i = 0; i < component_count; ++i)
+ component_nexterror = component_nexterror > component_errors[i] ? component_errors[i] : component_nexterror;
+
+#if TRACE
+ printf("components: %d (min error %e)\n", int(component_count), sqrtf(component_nexterror));
+#endif
+ }
#if TRACE
size_t pass_count = 0;
@@ -1654,6 +2481,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
size_t result_count = index_count;
float result_error = 0;
+ float vertex_error = 0;
// target_error input is linear; we need to adjust it to match quadricError units
float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? vertex_scale : 1.f;
@@ -1664,14 +2492,18 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
- size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop);
+ size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop, loopback);
assert(edge_collapse_count <= collapse_capacity);
// no edges can be collapsed any more due to topology restrictions
if (edge_collapse_count == 0)
break;
- rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+#if TRACE
+ printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
+#endif
+
+ rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);
sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
@@ -1682,39 +2514,101 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
memset(collapse_locked, 0, vertex_count);
-#if TRACE
- printf("pass %d: ", int(pass_count++));
-#endif
-
- size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
+ size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
if (collapses == 0)
break;
+ updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error);
+
+ // updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent
+ vertex_error = attribute_count == 0 ? result_error : vertex_error;
+
+ // note: we update loops following edge collapses, but after this we might still have stale loop data
+ // this can happen when a triangle with a loop edge gets collapsed along a non-loop edge
+ // that works since a loop that points to a vertex that is no longer connected is not affecting collapse logic
remapEdgeLoops(loop, vertex_count, collapse_remap);
remapEdgeLoops(loopback, vertex_count, collapse_remap);
- size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
- assert(new_count < result_count);
+ result_count = remapIndexBuffer(result, result_count, collapse_remap, remap);
+ if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error)
+ result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror);
+ }
+
+ // at this point, component_nexterror might be stale: component it references may have been removed through a series of edge collapses
+ bool component_nextstale = true;
+
+ // we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit
+ while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit)
+ {
+#if TRACE
+ printf("pass %d: cleanup; ", int(pass_count++));
+#endif
+
+ float component_cutoff = component_nexterror * 1.5f < error_limit ? component_nexterror * 1.5f : error_limit;
+
+ // track maximum error in eligible components as we are increasing resulting error
+ float component_maxerror = 0;
+ for (size_t i = 0; i < component_count; ++i)
+ if (component_errors[i] > component_maxerror && component_errors[i] <= component_cutoff)
+ component_maxerror = component_errors[i];
+
+ size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror);
+ if (new_count == result_count && !component_nextstale)
+ break;
+
+ component_nextstale = false; // pruneComponents guarantees next error is up to date
result_count = new_count;
+ result_error = result_error < component_maxerror ? component_maxerror : result_error;
+ vertex_error = vertex_error < component_maxerror ? component_maxerror : vertex_error;
}
#if TRACE
- printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count));
+ printf("result: %d triangles, error: %e (pos %.3e); total %d passes\n", int(result_count / 3), sqrtf(result_error), sqrtf(vertex_error), int(pass_count));
#endif
-#ifndef NDEBUG
- if (meshopt_simplifyDebugKind)
- memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+ // if solve is requested, update input buffers destructively from internal data
+ if (options & meshopt_SimplifyInternalSolve)
+ {
+ unsigned char* vertex_update = collapse_locked; // reuse as scratch space
+ memset(vertex_update, 0, vertex_count);
- if (meshopt_simplifyDebugLoop)
- memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+ // limit quadric solve to vertices that are still used in the result
+ for (size_t i = 0; i < result_count; ++i)
+ {
+ unsigned int v = result[i];
- if (meshopt_simplifyDebugLoopBack)
- memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
-#endif
+ // mark the vertex for finalizeVertices and root vertex for solve*
+ vertex_update[remap[v]] = vertex_update[v] = 1;
+ }
+
+ // edge adjacency may be stale as we haven't updated it after last series of edge collapses
+ updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+ solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+
+ if (attribute_count)
+ solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+ finalizeVertices(const_cast(vertex_positions_data), vertex_positions_stride, const_cast(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
+ }
+
+ // if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
+ if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap)
+ {
+ assert(Kind_Count <= 8 && vertex_count < (1 << 28)); // 3 bit kind, 1 bit loop
+
+ for (size_t i = 0; i < result_count; i += 3)
+ {
+ unsigned int a = result[i + 0], b = result[i + 1], c = result[i + 2];
+
+ result[i + 0] |= (vertex_kind[a] << 28) | (unsigned(loop[a] == b || loopback[b] == a) << 31);
+ result[i + 1] |= (vertex_kind[b] << 28) | (unsigned(loop[b] == c || loopback[c] == b) << 31);
+ result[i + 2] |= (vertex_kind[c] << 28) | (unsigned(loop[c] == a || loopback[a] == c) << 31);
+ }
+ }
// convert resulting indices back into the dense space of the larger mesh
if (sparse_remap)
@@ -1730,15 +2624,24 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
+ assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead
+
return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, target_index_count, target_error, options, out_result_error);
}
size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
+ assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead
+
return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error);
}
-size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+ return meshopt_simplifyEdge(indices, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options | meshopt_SimplifyInternalSolve, out_result_error);
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* out_result_error)
{
using namespace meshopt;
@@ -1766,15 +2669,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
const int kInterpolationPasses = 5;
// invariant: # of triangles in min_grid <= target_count
- int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error));
+ int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : (target_error < 1.f ? target_error : 1.f)));
int max_grid = 1025;
size_t min_triangles = 0;
size_t max_triangles = index_count / 3;
// when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid
- if (min_grid > 1)
+ if (min_grid > 1 || vertex_lock)
{
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
min_triangles = countTriangles(vertex_ids, indices, index_count);
}
@@ -1790,7 +2693,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
int grid_size = next_grid_size;
grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, grid_size);
size_t triangles = countTriangles(vertex_ids, indices, index_count);
#if TRACE
@@ -1800,7 +2703,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
(triangles <= target_index_count / 3) ? "under" : "over");
#endif
- float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+ float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
if (triangles <= target_index_count / 3)
{
@@ -1832,7 +2735,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
unsigned int* vertex_cells = allocator.allocate(vertex_count);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
// build a quadric for each target cell
@@ -1853,15 +2756,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
for (size_t i = 0; i < cell_count; ++i)
result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error;
- // collapse triangles!
- // note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+ // vertex collapses often result in duplicate triangles; we need a table to filter them out
size_t tritable_size = hashBuckets2(min_triangles);
unsigned int* tritable = allocator.allocate(tritable_size);
+ // note: this is the first and last write to destination, which allows aliasing destination with indices
size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
#if TRACE
- printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
+ printf("result: grid size %d, %d cells, %d triangles (%d unfiltered), error %e\n", min_grid, int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
#endif
if (out_result_error)
@@ -1870,6 +2773,40 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
return write;
}
+size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_error >= 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* result = destination;
+ if (result != indices)
+ memcpy(result, indices, index_count * sizeof(unsigned int));
+
+ // build position remap that maps each vertex to the one with identical position
+ unsigned int* remap = allocator.allocate(vertex_count);
+ buildPositionRemap(remap, NULL, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, allocator);
+
+ Vector3* vertex_positions = allocator.allocate(vertex_count);
+ rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, NULL);
+
+ unsigned int* components = allocator.allocate(vertex_count);
+ size_t component_count = buildComponents(components, vertex_count, indices, index_count, remap);
+
+ float* component_errors = allocator.allocate(component_count * 4); // overallocate for temporary use inside measureComponents
+ measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+ float component_nexterror = 0;
+ size_t result_count = pruneComponents(result, index_count, components, component_errors, component_count, target_error * target_error, component_nexterror);
+
+ return result_count;
+}
+
size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count)
{
using namespace meshopt;
@@ -1922,7 +2859,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
int grid_size = next_grid_size;
grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, grid_size);
size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
#if TRACE
@@ -1959,7 +2896,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
unsigned int* vertex_cells = allocator.allocate(vertex_count);
- computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, min_grid);
size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
// accumulate points into a reservoir for each target cell
@@ -1972,7 +2909,10 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
unsigned int* cell_remap = allocator.allocate(cell_count);
float* cell_errors = allocator.allocate(cell_count);
- fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight * color_weight, vertex_count);
+ // we scale the color weight to bring it to the same scale as position so that error addition makes sense
+ float color_weight_scaled = color_weight * (min_grid == 1 ? 1.f : 1.f / (min_grid - 1));
+
+ fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight_scaled * color_weight_scaled, vertex_count);
// copy results to the output
assert(cell_count <= target_vertex_count);
diff --git a/Source/ThirdParty/meshoptimizer/spatialorder.cpp b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
index 7b1a06945..8a785fcd5 100644
--- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp
+++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
@@ -10,18 +10,19 @@
namespace meshopt
{
-// "Insert" two 0 bits after each of the 10 low bits of x
-inline unsigned int part1By2(unsigned int x)
+// "Insert" two 0 bits after each of the 20 low bits of x
+inline unsigned long long part1By2(unsigned long long x)
{
- x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
- x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
- x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
- x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
- x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+ x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
+ x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
+ x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
+ x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
+ x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
+ x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
}
-static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
@@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
- float scale = extent == 0 ? 0.f : 1.f / extent;
+ // rescale each axis to 16 bits to get 48-bit Morton codes
+ float scale = extent == 0 ? 0.f : 65535.f / extent;
// generate Morton order based on the position inside a unit cube
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions_data + i * vertex_stride_float;
- int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
- int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
- int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+ int x = int((v[0] - minv[0]) * scale + 0.5f);
+ int y = int((v[1] - minv[1]) * scale + 0.5f);
+ int z = int((v[2] - minv[2]) * scale + 0.5f);
- result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+ if (morton)
+ result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+ else
+ result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
}
}
-static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
{
+ unsigned int hist[1024];
memset(hist, 0, sizeof(hist));
- // compute 3 10-bit histograms in parallel
+ // compute histogram (assume keys are 10-bit)
for (size_t i = 0; i < count; ++i)
- {
- unsigned int id = data[i];
+ hist[keys[i]]++;
- hist[(id >> 0) & 1023][0]++;
- hist[(id >> 10) & 1023][1]++;
- hist[(id >> 20) & 1023][2]++;
- }
-
- unsigned int sumx = 0, sumy = 0, sumz = 0;
+ unsigned int sum = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 1024; ++i)
{
- unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
-
- hist[i][0] = sumx;
- hist[i][1] = sumy;
- hist[i][2] = sumz;
-
- sumx += hx;
- sumy += hy;
- sumz += hz;
+ unsigned int h = hist[i];
+ hist[i] = sum;
+ sum += h;
}
- assert(sumx == count && sumy == count && sumz == count);
+ assert(sum == count);
+
+ // reorder values
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = keys[source[i]];
+
+ destination[hist[id]++] = source[i];
+ }
}
-static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
{
- int bitoff = pass * 10;
+ memset(hist, 0, sizeof(hist));
+
+ // compute 2 8-bit histograms in parallel
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned long long id = data[i];
+
+ hist[(id >> 0) & 255][0]++;
+ hist[(id >> 8) & 255][1]++;
+ }
+
+ unsigned int sum0 = 0, sum1 = 0;
+
+ // replace histogram data with prefix histogram sums in-place
+ for (int i = 0; i < 256; ++i)
+ {
+ unsigned int h0 = hist[i][0], h1 = hist[i][1];
+
+ hist[i][0] = sum0;
+ hist[i][1] = sum1;
+
+ sum0 += h0;
+ sum1 += h1;
+ }
+
+ assert(sum0 == count && sum1 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
+{
+ int bitoff = pass * 8;
for (size_t i = 0; i < count; ++i)
{
- unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+ unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255;
destination[hist[id][pass]++] = source[i];
}
}
+static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+ size_t l = 0, r = split;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned char side = sides[order[i]];
+ target[side ? r : l] = order[i];
+ l += 1;
+ l -= side;
+ r += side;
+ }
+
+ assert(l == split && r == count);
+}
+
+static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
+{
+ if (count <= cluster_size)
+ {
+ memcpy(destination, orderx, count * sizeof(unsigned int));
+ return;
+ }
+
+ unsigned int* axes[3] = {orderx, ordery, orderz};
+
+ int bestk = -1;
+ unsigned int bestdim = 0;
+
+ for (int k = 0; k < 3; ++k)
+ {
+ const unsigned int mask = (1 << 20) - 1;
+ unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask);
+
+ if (dim >= bestdim)
+ {
+ bestk = k;
+ bestdim = dim;
+ }
+ }
+
+ assert(bestk >= 0);
+
+ // split roughly in half, with the left split always being aligned to cluster size
+ size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
+ assert(split > 0 && split < count);
+
+ // mark sides of split for partitioning
+ unsigned char* sides = static_cast(scratch) + count * sizeof(unsigned int);
+
+ for (size_t i = 0; i < split; ++i)
+ sides[axes[bestk][i]] = 0;
+
+ for (size_t i = split; i < count; ++i)
+ sides[axes[bestk][i]] = 1;
+
+ // partition all axes into two sides, maintaining order
+ unsigned int* temp = static_cast(scratch);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ if (k == bestk)
+ continue;
+
+ unsigned int* axis = axes[k];
+ memcpy(temp, axis, sizeof(unsigned int) * count);
+ partitionPoints(axis, temp, sides, split, count);
+ }
+
+ // recursion depth is logarithmic and bounded as we always split in approximately half
+ splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
+ splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
+}
+
} // namespace meshopt
void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
meshopt_Allocator allocator;
- unsigned int* keys = allocator.allocate(vertex_count);
- computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+ unsigned long long* keys = allocator.allocate(vertex_count);
+ computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
- unsigned int hist[1024][3];
- computeHistogram(hist, keys, vertex_count);
-
- unsigned int* scratch = allocator.allocate(vertex_count);
+ unsigned int* scratch = allocator.allocate(vertex_count * 2); // 4b for order + 2b for keys
+ unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
destination[i] = unsigned(i);
- // 3-pass radix sort computes the resulting order into scratch
- radixPass(scratch, destination, keys, vertex_count, hist, 0);
- radixPass(destination, scratch, keys, vertex_count, hist, 1);
- radixPass(scratch, destination, keys, vertex_count, hist, 2);
+ unsigned int* order[] = {scratch, destination};
+
+ // 5-pass radix sort computes the resulting order into scratch
+ for (int k = 0; k < 5; ++k)
+ {
+ // copy 10-bit key segments into keyk to reduce cache pressure during radix pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
+
+ radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
+ }
// since our remap table is mapping old=>new, we need to reverse it
for (size_t i = 0; i < vertex_count; ++i)
@@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
destination[r * 3 + 2] = c;
}
}
+
+void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(cluster_size > 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned long long* keys = allocator.allocate(vertex_count);
+ computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false);
+
+ unsigned int* order = allocator.allocate(vertex_count * 3);
+ unsigned int* scratch = allocator.allocate(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
+ unsigned short* keyk = reinterpret_cast(scratch + vertex_count);
+
+ for (int k = 0; k < 3; ++k)
+ {
+ // copy 16-bit key segments into keyk to reduce cache pressure during radix pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ keyk[i] = (unsigned short)(keys[i] >> (k * 20));
+
+ unsigned int hist[256][2];
+ computeHistogram(hist, keyk, vertex_count);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ order[k * vertex_count + i] = unsigned(i);
+
+ radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0);
+ radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1);
+ }
+
+ splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
+}
diff --git a/Source/ThirdParty/meshoptimizer/stripifier.cpp b/Source/ThirdParty/meshoptimizer/stripifier.cpp
index d57fb512b..4043195ae 100644
--- a/Source/ThirdParty/meshoptimizer/stripifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/stripifier.cpp
@@ -10,14 +10,14 @@
namespace meshopt
{
-static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
{
unsigned int index = 0;
unsigned int iv = ~0u;
for (size_t i = 0; i < buffer_size; ++i)
{
- unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+ unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
if (v < iv)
@@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
size_t strip_size = 0;
// compute vertex valence; this is used to prioritize starting triangle for strips
- unsigned int* valence = allocator.allocate(vertex_count);
- memset(valence, 0, vertex_count * sizeof(unsigned int));
+ // note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+ unsigned char* valence = allocator.allocate(vertex_count);
+ memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
@@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
{
// if we didn't find anything, we need to find the next new triangle
// we use a heuristic to maximize the strip length
- unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+ unsigned int i = findStripFirst(buffer, buffer_size, valence);
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
// ordered removal from the buffer
diff --git a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
index 94f7a1adc..7085cce32 100644
--- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
@@ -60,6 +60,15 @@
#define SIMD_LATENCYOPT
#endif
+// In switch dispatch, marking default case as unreachable allows to remove redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
#endif // !MESHOPTIMIZER_NO_SIMD
#ifdef SIMD_SSE
@@ -90,6 +99,14 @@
#include
#endif
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include
+#endif
+
#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
@@ -105,50 +122,76 @@ namespace meshopt
const unsigned char kVertexHeader = 0xa0;
-static int gEncodeVertexVersion = 0;
+static int gEncodeVertexVersion = 1;
+const int kDecodeVertexVersion = 1;
const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;
static size_t getVertexBlockSize(size_t vertex_size)
{
- // make sure the entire block fits into the scratch buffer
- size_t result = kVertexBlockSizeBytes / vertex_size;
-
- // align to byte group size; we encode each byte as a byte group
- // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
- result &= ~(kByteGroupSize - 1);
+ // make sure the entire block fits into the scratch buffer and is aligned to byte group size
+ // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+ size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
{
- return ((signed char)(v) >> 7) ^ (v << 1);
+ return (v << r) | (v >> ((32 - r) & 31));
}
-inline unsigned char unzigzag8(unsigned char v)
+template
+inline T zigzag(T v)
{
- return -(v & 1) ^ (v >> 1);
+ return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
}
+template
+inline T unzigzag(T v)
+{
+ return (0 - (v & 1)) ^ (v >> 1);
+}
+
+#if TRACE
+struct Stats
+{
+ size_t size;
+ size_t header; // bytes for header
+ size_t bitg[9]; // bytes for bit groups
+ size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+ size_t ctrl[4]; // number of control groups
+};
+
+static Stats* bytestats = NULL;
+static Stats vertexstats[256];
+#endif
+
static bool encodeBytesGroupZero(const unsigned char* buffer)
{
- for (size_t i = 0; i < kByteGroupSize; ++i)
- if (buffer[i])
- return false;
+ assert(kByteGroupSize == sizeof(unsigned long long) * 2);
- return true;
+ unsigned long long v[2];
+ memcpy(v, buffer, sizeof(v));
+
+ return (v[0] | v[1]) == 0;
}
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
- assert(bits >= 1 && bits <= 8);
+ assert(bits >= 0 && bits <= 8);
- if (bits == 1)
+ if (bits == 0)
return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
if (bits == 8)
@@ -166,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
- assert(bits >= 1 && bits <= 8);
+ assert(bits >= 0 && bits <= 8);
+ assert(kByteGroupSize % 8 == 0);
- if (bits == 1)
+ if (bits == 0)
return data;
if (bits == 8)
@@ -196,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char*
byte |= enc;
}
+ // encode 1-bit groups in reverse bit order
+ // this makes them faster to decode alongside other groups
+ if (bits == 1)
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+
*data++ = byte;
}
for (size_t i = 0; i < kByteGroupSize; ++i)
{
- if (buffer[i] >= sentinel)
- {
- *data++ = buffer[i];
- }
+ unsigned char v = buffer[i];
+
+ // branchless append of out-of-range values
+ *data = v;
+ data += v >= sentinel;
}
return data;
}
-static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
{
assert(buffer_size % kByteGroupSize == 0);
@@ -226,69 +276,301 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
memset(header, 0, header_size);
+ int last_bits = -1;
+
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
{
if (size_t(data_end - data) < kByteGroupDecodeLimit)
return NULL;
- int best_bits = 8;
- size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+ int best_bitk = 3;
+ size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
- for (int bits = 1; bits < 8; bits *= 2)
+ for (int bitk = 0; bitk < 3; ++bitk)
{
- size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+ size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
- if (size < best_size)
+ // favor consistent bit selection across groups, but never replace literals
+ if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
{
- best_bits = bits;
+ best_bitk = bitk;
best_size = size;
}
}
- int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
- assert((1 << bitslog2) == best_bits);
-
size_t header_offset = i / kByteGroupSize;
+ header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
- header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
-
+ int best_bits = bits[best_bitk];
unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
assert(data + best_size == next);
data = next;
+ last_bits = best_bits;
+
+#if TRACE
+ bytestats->bitg[best_bits] += best_size;
+#endif
}
+#if TRACE
+ bytestats->header += header_size;
+#endif
+
return data;
}
-static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template
+static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
+{
+ size_t k0 = k & ~(sizeof(T) - 1);
+ int ks = (k & (sizeof(T) - 1)) * 8;
+
+ T p = last_vertex[k0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ p |= T(last_vertex[k0 + j]) << (j * 8);
+
+ const unsigned char* vertex = vertex_data + k0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ T v = vertex[0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ v |= vertex[j] << (j * 8);
+
+ T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
+
+ buffer[i] = (unsigned char)(d >> ks);
+ p = v;
+ vertex += vertex_size;
+ }
+}
+
+static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
+{
+ switch (channel & 3)
+ {
+ case 0:
+ return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+ case 1:
+ return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+ case 2:
+ return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
+ default:
+ assert(!"Unsupported channel encoding"); // unreachable
+ }
+}
+
+static int estimateBits(unsigned char v)
+{
+ return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
+}
+
+static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
+{
+ size_t sizes[8] = {};
+
+ const unsigned char* vertex = vertex_data + k;
+ unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+
+ for (size_t i = 0; i < vertex_count; i += group_size)
+ {
+ unsigned int bitg = 0;
+
+ // calculate bit consistency mask for the group
+ for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
+ {
+ unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+ unsigned int d = v ^ last;
+
+ bitg |= d;
+ last = v;
+ vertex += vertex_size;
+ }
+
+#if TRACE
+ for (int j = 0; j < 32; ++j)
+ vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
+#endif
+
+ for (int j = 0; j < 8; ++j)
+ {
+ unsigned int bitr = rotate(bitg, j);
+
+ sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
+ sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
+ }
+ }
+
+ int best_rot = 0;
+ for (int rot = 1; rot < 8; ++rot)
+ best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
+
+ return best_rot;
+}
+
+static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
+{
+ unsigned char block[kVertexBlockMaxSize];
+ assert(vertex_block_size <= kVertexBlockMaxSize);
+
+ unsigned char last_vertex[256] = {};
+
+ size_t sizes[3] = {};
+ assert(max_channel <= 3);
+
+ for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
+ {
+ size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
+ size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+ memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
+
+ // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+ if (block_size < block_size_aligned)
+ memset(block + block_size, 0, block_size_aligned - block_size);
+
+ for (int channel = 0; channel < max_channel; ++channel)
+ for (size_t j = 0; j < 4; ++j)
+ {
+ encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
+
+ for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
+ {
+ // to maximize encoding performance we only evaluate 1/2/4/8 bit groups
+ size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
+ size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
+ size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
+ size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
+
+ size_t best_size = size1 < size2 ? size1 : size2;
+ best_size = best_size < size4 ? best_size : size4;
+ best_size = best_size < size8 ? best_size : size8;
+
+ sizes[channel] += best_size;
+ }
+ }
+ }
+
+ int best_channel = 0;
+ for (int channel = 1; channel < max_channel; ++channel)
+ best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
+
+ return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
+}
+
+static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
+{
+ for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+ if (!encodeBytesGroupZero(buffer + i))
+ return false;
+
+ return true;
+}
+
+static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
+{
+ if (estimateControlZero(buffer, vertex_count_aligned))
+ return 2; // zero encoding
+
+ if (level == 0)
+ return 1; // 1248 encoding in level 0 for encoding speed
+
+ // round number of groups to 4 to get number of header bytes
+ size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
+
+ size_t est_bytes0 = header_size, est_bytes1 = header_size;
+
+ for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+ {
+ // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
+ size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
+ size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
+ size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
+ size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
+ size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
+
+ // both control modes have access to 1/2/4 bit encoding
+ size_t size12 = size1 < size2 ? size1 : size2;
+ size_t size124 = size12 < size4 ? size12 : size4;
+
+ // each control mode has access to 0/8 bit encoding respectively
+ est_bytes0 += size124 < size0 ? size124 : size0;
+ est_bytes1 += size124 < size8 ? size124 : size8;
+ }
+
+ // pick shortest control entry but prefer literal encoding
+ if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
+ return est_bytes0 < est_bytes1 ? 0 : 1;
+ else
+ return 3; // literal encoding
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
{
assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+ assert(vertex_size % 4 == 0);
unsigned char buffer[kVertexBlockMaxSize];
assert(sizeof(buffer) % kByteGroupSize == 0);
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
memset(buffer, 0, sizeof(buffer));
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ unsigned char* control = data;
+ data += control_size;
+
+ memset(control, 0, control_size);
+
for (size_t k = 0; k < vertex_size; ++k)
{
- size_t vertex_offset = k;
+ encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
- unsigned char p = last_vertex[k];
+#if TRACE
+ const unsigned char* olddata = data;
+ bytestats = &vertexstats[k];
+#endif
- for (size_t i = 0; i < vertex_count; ++i)
+ int ctrl = 0;
+
+ if (version != 0)
{
- buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+ ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
- p = vertex_data[vertex_offset];
+ assert(unsigned(ctrl) < 4);
+ control[k / 4] |= ctrl << ((k % 4) * 2);
- vertex_offset += vertex_size;
+#if TRACE
+ vertexstats[k].ctrl[ctrl]++;
+#endif
}
- data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
- if (!data)
- return NULL;
+ if (ctrl == 3)
+ {
+ // literal encoding
+ if (size_t(data_end - data) < vertex_count)
+ return NULL;
+
+ memcpy(data, buffer, vertex_count);
+ data += vertex_count;
+ }
+ else if (ctrl != 2) // non-zero encoding
+ {
+ data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+ if (!data)
+ return NULL;
+ }
+
+#if TRACE
+ bytestats = NULL;
+ vertexstats[k].size += data - olddata;
+#endif
}
memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -297,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
}
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
-static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
@@ -305,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
unsigned char byte, enc, encv;
const unsigned char* data_var;
- switch (bitslog2)
+ switch (bits)
{
case 0:
memset(buffer, 0, kByteGroupSize);
return data;
case 1:
+ data_var = data + 2;
+
+ // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
+ READ();
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+ NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+ READ();
+ byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+ NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+
+ return data_var;
+ case 2:
data_var = data + 4;
// 4 groups with 4 2-bit values in each byte
@@ -320,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
return data_var;
- case 2:
+ case 4:
data_var = data + 8;
// 8 groups with 2 4-bit values in each byte
@@ -334,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
READ(), NEXT(4), NEXT(4);
return data_var;
- case 3:
+ case 8:
memcpy(buffer, data, kByteGroupSize);
return data + kByteGroupSize;
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ assert(!"Unexpected bit length"); // unreachable
return data;
}
@@ -346,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
#undef NEXT
}
-static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
{
assert(buffer_size % kByteGroupSize == 0);
- const unsigned char* header = data;
-
// round number of groups to 4 to get number of header bytes
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
if (size_t(data_end - data) < header_size)
return NULL;
+ const unsigned char* header = data;
data += header_size;
for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
@@ -366,43 +658,109 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
return NULL;
size_t header_offset = i / kByteGroupSize;
+ int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
- int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
- data = decodeBytesGroup(data, buffer + i, bitslog2);
+ data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
}
return data;
}
-static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template
+static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
{
- assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
-
- unsigned char buffer[kVertexBlockMaxSize];
- unsigned char transposed[kVertexBlockSizeBytes];
-
- size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
-
- for (size_t k = 0; k < vertex_size; ++k)
+ for (size_t k = 0; k < 4; k += sizeof(T))
{
- data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
- if (!data)
- return NULL;
-
size_t vertex_offset = k;
- unsigned char p = last_vertex[k];
+ T p = last_vertex[0];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ p |= last_vertex[j] << (8 * j);
for (size_t i = 0; i < vertex_count; ++i)
{
- unsigned char v = unzigzag8(buffer[i]) + p;
+ T v = buffer[i];
+ for (size_t j = 1; j < sizeof(T); ++j)
+ v |= buffer[i + vertex_count * j] << (8 * j);
+
+ v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;
+
+ for (size_t j = 0; j < sizeof(T); ++j)
+ transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));
- transposed[vertex_offset] = v;
p = v;
vertex_offset += vertex_size;
}
+
+ buffer += vertex_count * sizeof(T);
+ last_vertex += sizeof(T);
+ }
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
+{
+ assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+ unsigned char buffer[kVertexBlockMaxSize * 4];
+ unsigned char transposed[kVertexBlockSizeBytes];
+
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+ assert(vertex_count <= vertex_count_aligned);
+
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ const unsigned char* control = data;
+ data += control_size;
+
+ for (size_t k = 0; k < vertex_size; k += 4)
+ {
+ unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
+ for (size_t j = 0; j < 4; ++j)
+ {
+ int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+ if (ctrl == 3)
+ {
+ // literal encoding
+ if (size_t(data_end - data) < vertex_count)
+ return NULL;
+
+ memcpy(buffer + j * vertex_count, data, vertex_count);
+ data += vertex_count;
+ }
+ else if (ctrl == 2)
+ {
+ // zero encoding
+ memset(buffer + j * vertex_count, 0, vertex_count);
+ }
+ else
+ {
+ data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+ if (!data)
+ return NULL;
+ }
+ }
+
+ int channel = version == 0 ? 0 : channels[k / 4];
+
+ switch (channel & 3)
+ {
+ case 0:
+ decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+ break;
+ case 1:
+ decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+ break;
+ case 2:
+ decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+ break;
+ default:
+ return NULL; // invalid channel type
+ }
}
memcpy(vertex_data, transposed, vertex_count * vertex_size);
@@ -447,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#ifdef SIMD_SSE
SIMD_TARGET
-static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
__m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0]));
__m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1]));
@@ -459,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
}
SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
__m128i result = _mm_setzero_si128();
@@ -473,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
#ifdef __GNUC__
typedef int __attribute__((aligned(1))) unaligned_int;
@@ -505,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
unsigned char mask1 = (unsigned char)(mask16 >> 8);
__m128i shuf = decodeShuffleMask(mask0, mask1);
-
__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -518,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
#ifdef SIMD_LATENCYOPT
unsigned long long data64;
@@ -541,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
unsigned char mask1 = (unsigned char)(mask16 >> 8);
__m128i shuf = decodeShuffleMask(mask0, mask1);
-
__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -554,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
__m128i result = _mm_loadu_si128(reinterpret_cast(data));
@@ -562,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2));
+
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ __m128i shuf = decodeShuffleMask(mask0, mask1);
+ __m128i result = _mm_shuffle_epi8(rest, shuf);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_AVX
-static const __m128i decodeBytesGroupConfig[] = {
- _mm_set1_epi8(3),
- _mm_set1_epi8(15),
- _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
- _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+static const __m128i kDecodeBytesGroupConfig[8][2] = {
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+ {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_setzero_si128(), _mm_setzero_si128()},
+ {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
+ {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+ {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
};
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
__m128i result = _mm_setzero_si128();
@@ -590,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data;
}
- case 1:
- case 2:
+ case 5: // 1-bit
+ case 1: // 2-bit
+ case 6:
+ case 2: // 4-bit
+ case 7:
{
- const unsigned char* skip = data + (bitslog2 << 2);
+ const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));
__m128i selb = _mm_loadl_epi64(reinterpret_cast(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast(skip));
- __m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
- __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+ __m128i sent = kDecodeBytesGroupConfig[hbits][0];
+ __m128i ctrl = kDecodeBytesGroupConfig[hbits][1];
__m128i selw = _mm_shuffle_epi32(selb, 0x44);
__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
@@ -613,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
__m128i result = _mm_loadu_si128(reinterpret_cast(data));
@@ -622,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_NEON
-static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+SIMD_TARGET
+inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
@@ -640,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
return vcombine_u8(r0, r1);
}
-static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+SIMD_TARGET
+inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
@@ -651,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m
mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
uint8x16_t result = vdupq_n_u8(0);
@@ -665,6 +1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
#ifdef SIMD_LATENCYOPT
unsigned int data32;
@@ -702,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
#ifdef SIMD_LATENCYOPT
unsigned long long data64;
@@ -736,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
uint8x16_t result = vld1q_u8(data);
@@ -744,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ uint8x8_t rest0 = vld1_u8(data + 2);
+ uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);
+
+ uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);
+
+ vst1q_u8(buffer, result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#ifdef SIMD_WASM
SIMD_TARGET
-static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
- v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
- sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-
+ v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
return wasmx_unpacklo_v64x2(sm0, sm1r);
}
SIMD_TARGET
-static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
@@ -777,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
}
SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
- switch (bitslog2)
+ switch (hbits)
{
case 0:
+ case 4:
{
v128_t result = wasm_i8x16_splat(0);
@@ -791,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 1:
+ case 6:
{
v128_t sel2 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 4);
@@ -805,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
-
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
@@ -814,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 2:
+ case 7:
{
v128_t sel4 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 8);
@@ -827,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
-
v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
@@ -836,6 +1239,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
case 3:
+ case 8:
{
v128_t result = wasm_v128_load(data);
@@ -844,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
return data + 16;
}
+ case 5:
+ {
+ v128_t rest = wasm_v128_load(data + 2);
+
+ unsigned char mask0 = data[0];
+ unsigned char mask1 = data[1];
+
+ v128_t shuf = decodeShuffleMask(mask0, mask1);
+ v128_t result = wasm_i8x16_swizzle(rest, shuf);
+
+ wasm_v128_store(buffer, result);
+
+ return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
default:
- assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
- return data;
+ SIMD_UNREACHABLE(); // unreachable
}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
-static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
__m128i t0 = _mm_unpacklo_epi8(x0, x1);
__m128i t1 = _mm_unpackhi_epi8(x0, x1);
@@ -867,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
}
SIMD_TARGET
-static __m128i unzigzag8(__m128i v)
+inline __m128i unzigzag8(__m128i v)
{
__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
return _mm_xor_si128(xl, xr);
}
+
+SIMD_TARGET
+inline __m128i unzigzag16(__m128i v)
+{
+ __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
+ __m128i xr = _mm_srli_epi16(v, 1);
+
+ return _mm_xor_si128(xl, xr);
+}
+
+SIMD_TARGET
+inline __m128i rotate32(__m128i v, int r)
+{
+ return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
+}
#endif
#ifdef SIMD_NEON
-static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+SIMD_TARGET
+inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
uint8x16x2_t t01 = vzipq_u8(x0, x1);
uint8x16x2_t t23 = vzipq_u8(x2, x3);
@@ -891,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_
x3 = vreinterpretq_u8_u16(x23.val[1]);
}
-static uint8x16_t unzigzag8(uint8x16_t v)
+SIMD_TARGET
+inline uint8x16_t unzigzag8(uint8x16_t v)
{
uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
uint8x16_t xr = vshrq_n_u8(v, 1);
return veorq_u8(xl, xr);
}
+
+SIMD_TARGET
+inline uint8x16_t unzigzag16(uint8x16_t v)
+{
+ uint16x8_t vv = vreinterpretq_u16_u8(v);
+ uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
+ uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));
+
+ return veorq_u8(xl, xr);
+}
+
+SIMD_TARGET
+inline uint8x16_t rotate32(uint8x16_t v, int r)
+{
+ uint32x4_t v32 = vreinterpretq_u32_u8(v);
+ return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
+}
+
+template
+SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
+{
+ switch (Channel)
+ {
+ case 0:
+ {
+ uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
+ uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+ return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+ }
+ case 1:
+ {
+ uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
+ uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
+ return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
+ }
+ case 2:
+ {
+ uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
+ uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+ return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+ }
+ default:
+ return npi;
+ }
+}
#endif
#ifdef SIMD_WASM
SIMD_TARGET
-static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
@@ -916,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
}
SIMD_TARGET
-static v128_t unzigzag8(v128_t v)
+inline v128_t unzigzag8(v128_t v)
{
v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
v128_t xr = wasm_u8x16_shr(v, 1);
return wasm_v128_xor(xl, xr);
}
+
+SIMD_TARGET
+inline v128_t unzigzag16(v128_t v)
+{
+ v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
+ v128_t xr = wasm_u16x8_shr(v, 1);
+
+ return wasm_v128_xor(xl, xr);
+}
+
+SIMD_TARGET
+inline v128_t rotate32(v128_t v, int r)
+{
+ return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
+}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
-static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
{
assert(buffer_size % kByteGroupSize == 0);
assert(kByteGroupSize == 16);
- const unsigned char* header = data;
-
// round number of groups to 4 to get number of header bytes
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
if (size_t(data_end - data) < header_size)
return NULL;
+ const unsigned char* header = data;
data += header_size;
size_t i = 0;
- // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+ // fast-path: process 4 groups at a time, do a shared bounds check
for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
{
size_t header_offset = i / kByteGroupSize;
unsigned char header_byte = header[header_offset / 4];
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
- data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
}
// slow-path: process remaining groups
@@ -963,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
return NULL;
size_t header_offset = i / kByteGroupSize;
+ unsigned char header_byte = header[header_offset / 4];
- int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
- data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+ data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
}
return data;
}
+template
+SIMD_TARGET static void
+decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
+{
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
+#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
+#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
+#endif
+
+#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
+
+ PREP();
+
+ unsigned char* savep = transposed;
+
+ for (size_t j = 0; j < vertex_count_aligned; j += 16)
+ {
+ LOAD(0);
+ LOAD(1);
+ LOAD(2);
+ LOAD(3);
+
+ transpose8(r0, r1, r2, r3);
+
+ TEMP t0, t1, t2, t3;
+ TEMP npi = pi;
+
+ UNZR(0);
+ GRP4(0);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(1);
+ GRP4(1);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(2);
+ GRP4(2);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ UNZR(3);
+ GRP4(3);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+ // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+ pi = rebase(npi, r0, r1, r2, r3);
+#else
+ (void)npi;
+#endif
+
+#undef UNZR
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+ }
+}
+
SIMD_TARGET
-static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
@@ -982,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+ size_t control_size = version == 0 ? 0 : vertex_size / 4;
+ if (size_t(data_end - data) < control_size)
+ return NULL;
+
+ const unsigned char* control = data;
+ data += control_size;
+
for (size_t k = 0; k < vertex_size; k += 4)
{
+ unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
for (size_t j = 0; j < 4; ++j)
{
- data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
- if (!data)
- return NULL;
+ int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+ if (ctrl == 3)
+ {
+ // literal encoding; safe to over-copy due to tail
+ if (size_t(data_end - data) < vertex_count_aligned)
+ return NULL;
+
+ memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
+ data += vertex_count;
+ }
+ else if (ctrl == 2)
+ {
+ // zero encoding
+ memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
+ }
+ else
+ {
+ // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
+ int hshift = version == 0 ? 0 : 4 + ctrl;
+
+ data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
+ if (!data)
+ return NULL;
+ }
}
-#if defined(SIMD_SSE) || defined(SIMD_AVX)
-#define TEMP __m128i
-#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast