Merge branch 'master' into Improve-HighlightedPopUpColor

This commit is contained in:
Phantom
2026-01-12 22:56:49 +01:00
112 changed files with 7452 additions and 2382 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f34bf867df5f4296ca66ac691c2bca4efa168fb9e21ca4e613e8086669575cf
size 13296
oid sha256:615dff65b01507be6c4de722e126324aba20fc197f8e12dafaa94a05e46cba6e
size 13222

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:064f54786958f109222c49cbc0358ff4f345b30010fcd5e8cc1fab7bdc68c4fe
size 13349
oid sha256:1f07ebb16820897e8598ae7a0627cb75b3d28e9dceea3ad4bd9ff543d5cdd01c
size 13979

View File

@@ -4,7 +4,7 @@
"Major": 1,
"Minor": 11,
"Revision": 0,
"Build": 6805
"Build": 6806
},
"Company": "Flax",
"Copyright": "Copyright (c) 2012-2025 Wojciech Figat. All rights reserved.",

View File

@@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated
settingsButton.Tag = script;
settingsButton.Clicked += OnSettingsButtonClicked;
group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2);
// Adjust margin to not overlap with other ui elements in the header
group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin };
group.Object(values, editor);
// Remove drop down arrows and containment lines if no objects in the group
if (group.Children.Count == 0)

View File

@@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors
protected bool NotNullItems;
private IntValueBox _sizeBox;
private Label _label;
private Color _background;
private int _elementsCount, _minCount, _maxCount;
private bool _readOnly;
@@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors
Parent = dropPanel,
};
var label = new Label
_label = new Label
{
Text = "Size",
AnchorPreset = AnchorPresets.TopRight,
@@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors
Resize(Count + 1);
};
}
}
Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged;
}
private void OnSetupContextMenu(ContextMenu menu, DropPanel panel)
{
if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal)))
@@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors
});
}
private void OnLayoutSizeChanged(Control control)
{
if (Layout.ContainerControl is DropPanel dropPanel)
{
// Hide "Size" text when array editor title overlaps
var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText);
if (headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left)
_label.TextColor = _label.TextColorHighlighted = Color.Transparent;
else
_label.TextColor = _label.TextColorHighlighted = FlaxEngine.GUI.Style.Current.Foreground;
}
}
/// <inheritdoc />
protected override void Deinitialize()
{
_sizeBox = null;
Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged;
base.Deinitialize();
}

View File

@@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements
{
var style = Style.Current;
var settingsButtonSize = Panel.HeaderHeight;
return new Image
Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin };
; return new Image
{
TooltipText = "Settings",
AutoFocus = true,

View File

@@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input
/// </summary>
public event Action SlidingEnd;
/// <summary>
/// If enabled, pressing the arrow up or down key increments/ decrements the value.
/// </summary>
public bool ArrowKeysIncrement = true;
/// <summary>
/// Gets or sets the slider speed. Use value 0 to disable and hide slider UI.
/// </summary>
@@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input
ResetViewOffset();
}
/// <inheritdoc />
public override bool OnKeyDown(KeyboardKeys key)
{
if (ArrowKeysIncrement && (key == KeyboardKeys.ArrowUp || key == KeyboardKeys.ArrowDown))
{
bool altDown = Root.GetKey(KeyboardKeys.Alt);
bool shiftDown = Root.GetKey(KeyboardKeys.Shift);
bool controlDown = Root.GetKey(KeyboardKeys.Control);
float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1f));
float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue;
_startSlideValue = Value;
ApplySliding(slideDelta);
EndSliding();
Focus();
return true;
}
return base.OnKeyDown(key);
}
/// <inheritdoc />
public override bool OnMouseDown(Float2 location, MouseButton button)
{

View File

@@ -125,6 +125,7 @@ namespace FlaxEditor.Modules
private ContextMenuButton _menuToolsProfilerWindow;
private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault;
private ContextMenuButton _menuToolsTakeScreenshot;
private ContextMenuButton _menuToolsOpenLocalFolder;
private ContextMenuChildMenu _menuWindowApplyWindowLayout;
private ToolStripButton _toolStripSaveAll;
@@ -725,6 +726,16 @@ namespace FlaxEditor.Modules
_menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot);
cm.AddSeparator();
cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
cm.AddSeparator();
var childMenu = cm.AddChildMenu("Open Product Local folder");
childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
_menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () =>
{
string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
GameSettings settings = GameSettings.Load<GameSettings>();
string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
FileSystem.ShowFileExplorer(path);
});
// Window
MenuWindow = MainMenu.AddButton("Window");
@@ -1062,6 +1073,10 @@ namespace FlaxEditor.Modules
_menuToolsBuildNavMesh.Enabled = canEdit;
_menuToolsCancelBuilding.Enabled = GameCooker.IsRunning;
_menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0;
string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
GameSettings settings = GameSettings.Load<GameSettings>();
string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
_menuToolsOpenLocalFolder.Enabled = Directory.Exists(path);
c.PerformLayout();
}

View File

@@ -571,6 +571,10 @@ namespace FlaxEditor.Options
[EditorDisplay("View Flags"), EditorOrder(3260)]
public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift);
[DefaultValue(typeof(InputBinding), "None")]
[EditorDisplay("View Flags"), EditorOrder(3270)]
public InputBinding Particles = new InputBinding(KeyboardKeys.None);
#endregion
#region Interface

View File

@@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors
if (value is BoxCollider collider)
collider.AutoResize(!_keepLocalOrientation);
}
Presenter.OnModified();
}
}

View File

@@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities
/// <returns>The result value.</returns>
public static double Parse(string text)
{
// Hack to allow parsing numbers while using "_" as a separator (like this: 1_000)
text = text.Replace("_", string.Empty);
var tokens = Tokenize(text);
var rpn = OrderTokens(tokens);
return EvaluateRPN(rpn);

View File

@@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport
InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog);
InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight);
InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals);
InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles);
InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess);
InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom);
InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping);
@@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport
new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog),
new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight),
new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals),
new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles),
new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess),
new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom),
new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping),
@@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport
if (cm.Visible == false)
return;
var ccm = (ContextMenu)cm;
var flags = Task.View.Flags;
foreach (var e in ccm.Items)
{
if (e is ContextMenuButton b && b.Tag != null)
{
var v = (ViewFlags)b.Tag;
b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
b.Icon = (flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
}
}
}

View File

@@ -45,7 +45,7 @@ namespace FlaxEditor.Windows
{
Parent = this
};
_saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save");
_saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save.");
_saveButton.Enabled = false;
_tabs = new Tabs
@@ -104,6 +104,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = true;
_isDataDirty = true;
if (!Title.EndsWith('*'))
Title += "*";
}
}
@@ -113,6 +115,8 @@ namespace FlaxEditor.Windows
{
_saveButton.Enabled = false;
_isDataDirty = false;
if (Title.EndsWith('*'))
Title = Title.Remove(Title.Length - 1);
}
}

View File

@@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const
return false;
}
#if USE_EDITOR
void Material::GetReferences(Array<Guid>& assets, Array<String>& files) const
{
ShaderAssetTypeBase<MaterialBase>::GetReferences(assets, files);
// Collect references from material graph (needs to load it)
if (!WaitForLoaded() && HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE))
{
ScopeLock lock(Locker);
if (!LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE)))
{
const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE);
if (surfaceChunk)
{
MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size());
MaterialGraph graph;
if (!graph.Load(&stream, false))
{
graph.GetReferences(assets);
}
}
}
}
}
#endif
const MaterialInfo& Material::GetInfo() const
{
if (_materialShader)

View File

@@ -38,6 +38,9 @@ public:
public:
// [MaterialBase]
bool IsMaterialInstance() const override;
#if USE_EDITOR
void GetReferences(Array<Guid>& assets, Array<String>& files) const override;
#endif
// [IMaterial]
const MaterialInfo& GetInfo() const override;

View File

@@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool de
return list->Get() + startIndex;
}
FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest)
{
Array<DebugTriangle>* list;
if (depthTest)
list = duration > 0 ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles;
else
list = duration > 0 ? &Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles;
const int32 startIndex = list->Count();
list->AddUninitialized(count);
return list->Get() + startIndex;
}
inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer)
{
Matrix w, fw, m;
@@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Color& col
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Float3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Span<int32
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
const Float3 origin = Context->Origin;
for (int32 i = 0; i < indices.Length();)
{
@@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Color& co
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
const Double3 origin = Context->Origin;
for (int32 i = 0; i < vertices.Length();)
{
@@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Span<int3
DebugTriangle t;
t.Color = Color32(color);
t.TimeLeft = duration;
auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
const Double3 origin = Context->Origin;
for (int32 i = 0; i < indices.Length();)
{

View File

@@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64
/// </summary>
LightsDebug = 1 << 27,
/// <summary>
/// Shows/hides particle effects.
/// </summary>
Particles = 1 << 28,
/// <summary>
/// Default flags for Game.
/// </summary>
DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky,
DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles,
/// <summary>
/// Default flags for Editor.
/// </summary>
DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky,
DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles,
/// <summary>
/// Default flags for materials/models previews generating.
/// </summary>
DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky,
DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles,
};
DECLARE_ENUM_OPERATORS(ViewFlags);

View File

@@ -10,7 +10,7 @@
/// <summary>
/// Current materials shader version.
/// </summary>
#define MATERIAL_GRAPH_VERSION 178
#define MATERIAL_GRAPH_VERSION 179
class Material;
class GPUShader;

View File

@@ -191,7 +191,7 @@ bool GlobalIlluminationFeature::Bind(MaterialShader::BindParameters& params, Spa
{
// Unbind SRVs to prevent issues
data.DDGI.CascadesCount = 0;
data.DDGI.FallbackIrradiance = Float3::Zero;
data.DDGI.FallbackIrradiance = Float4::Zero;
params.GPUContext->UnBindSR(srv + 0);
params.GPUContext->UnBindSR(srv + 1);
params.GPUContext->UnBindSR(srv + 2);

View File

@@ -17,7 +17,7 @@ public:
/// <summary>
/// Mesh data stream.
/// </summary>
struct Stream
struct FLAXENGINE_API Stream
{
friend MeshAccessor;

View File

@@ -378,7 +378,7 @@ API_STRUCT() struct FLAXENGINE_API GlobalIlluminationSettings : ISerializable
/// The irradiance lighting outside the GI range used as a fallback to prevent pure-black scene outside the Global Illumination range.
/// </summary>
API_FIELD(Attributes="EditorOrder(40), PostProcessSetting((int)GlobalIlluminationSettingsOverride.FallbackIrradiance)")
Color FallbackIrradiance = Color::Black;
Color FallbackIrradiance = Color::Transparent;
public:
/// <summary>

View File

@@ -620,6 +620,40 @@ void RenderTools::ComputeSphereModelDrawMatrix(const RenderView& view, const Flo
resultIsViewInside = Float3::DistanceSquared(view.Position, position) < Math::Square(radius * 1.1f); // Manually tweaked bias
}
Float3 RenderTools::GetColorQuantizationError(PixelFormat format)
{
Float3 mantissaBits;
switch (format)
{
case PixelFormat::R11G11B10_Float:
mantissaBits = Float3(6, 6, 5);
break;
case PixelFormat::R10G10B10A2_UNorm:
mantissaBits = Float3(10, 10, 10);
break;
case PixelFormat::R16G16B16A16_Float:
mantissaBits = Float3(16, 16, 16);
break;
case PixelFormat::R32G32B32A32_Float:
mantissaBits = Float3(23, 23, 23);
break;
case PixelFormat::R9G9B9E5_SharedExp:
mantissaBits = Float3(5, 6, 5);
break;
case PixelFormat::R8G8B8A8_UNorm:
case PixelFormat::B8G8R8A8_UNorm:
mantissaBits = Float3(8, 8, 8);
break;
default:
return Float3::Zero;
}
return {
Math::Pow(0.5f, mantissaBits.X),
Math::Pow(0.5f, mantissaBits.Y),
Math::Pow(0.5f, mantissaBits.Z)
};
}
int32 MipLevelsCount(int32 width)
{
int32 result = 1;

View File

@@ -140,6 +140,9 @@ public:
static void CalculateTangentFrame(Float3& resultNormal, Float4& resultTangent, const Float3& normal, const Float3& tangent);
static void ComputeSphereModelDrawMatrix(const RenderView& view, const Float3& position, float radius, Matrix& resultWorld, bool& resultIsViewInside);
// Calculates error for a given render target format to reduce floating-point precision artifacts via QuantizeColor (from Noise.hlsl).
static Float3 GetColorQuantizationError(PixelFormat format);
};
// Calculates mip levels count for a texture 1D.

View File

@@ -80,6 +80,8 @@ Delegate<const Float2&, MouseButton> Input::MouseDoubleClick;
Delegate<const Float2&, float> Input::MouseWheel;
Delegate<const Float2&> Input::MouseMove;
Action Input::MouseLeave;
Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonDown;
Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonUp;
Delegate<const Float2&, int32> Input::TouchDown;
Delegate<const Float2&, int32> Input::TouchMove;
Delegate<const Float2&, int32> Input::TouchUp;
@@ -1027,6 +1029,19 @@ void InputService::Update()
break;
}
}
// TODO: route gamepad button events into global InputEvents queue to improve processing
for (int32 i = 0; i < Input::Gamepads.Count(); i++)
{
auto gamepad = Input::Gamepads[i];
for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++)
{
GamepadButton button = (GamepadButton)buttonIdx;
if (gamepad->GetButtonDown(button))
Input::GamepadButtonDown((InputGamepadIndex)i, button);
else if (gamepad->GetButtonUp(button))
Input::GamepadButtonUp((InputGamepadIndex)i, button);
}
}
// Update all actions
for (int32 i = 0; i < Input::ActionMappings.Count(); i++)

View File

@@ -113,6 +113,16 @@ public:
/// </summary>
API_EVENT() static Action MouseLeave;
/// <summary>
/// Event fired when gamepad button goes down.
/// </summary>
API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonDown;
/// <summary>
/// Event fired when gamepad button goes up.
/// </summary>
API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonUp;
/// <summary>
/// Event fired when touch action begins.
/// </summary>

View File

@@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const
void ParticleEffect::Draw(RenderContext& renderContext)
{
if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas)
if (renderContext.View.Pass == DrawPass::GlobalSDF ||
renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas ||
EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles))
return;
_lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition));
RenderContextBatch renderContextBatch(renderContext);
@@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext)
void ParticleEffect::Draw(RenderContextBatch& renderContextBatch)
{
const RenderView& mainView = renderContextBatch.GetMainContext().View;
if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles))
return;
Particles::DrawParticles(renderContextBatch, this);
// Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections
const RenderView& mainView = renderContextBatch.GetMainContext().View;
const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius);
if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds))
return;

View File

@@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value)
void BoxCollider::AutoResize(bool globalOrientation = true)
{
Actor* parent = GetParent();
if (Cast<Scene>(parent))
if (parent == nullptr || Cast<Scene>(parent))
return;
// Get bounds of all siblings (excluding itself)
const Vector3 parentScale = parent->GetScale();
if (parentScale.IsAnyZero())
return; // Avoid division by zero
return;
// Hacky way to get unrotated bounded box of parent.
// Hacky way to get unrotated bounded box of parent
const Quaternion parentOrientation = parent->GetOrientation();
parent->SetOrientation(Quaternion::Identity);
BoundingBox parentBox = parent->GetBox();

View File

@@ -543,7 +543,6 @@ void WindowsPlatform::ReleaseMutex()
}
}
PRAGMA_DISABLE_OPTIMIZATION;
void CheckInstructionSet()
{
#if PLATFORM_ARCH_X86 || PLATFORM_ARCH_X64

View File

@@ -37,8 +37,45 @@ GPU_CB_STRUCT(Data {
Float3 Dummy;
float LutWeight;
void Init(const PostProcessSettings& settings, GPUTexture*& lut)
{
Dummy = Float2::Zero;
auto& toneMapping = settings.ToneMapping;
auto& colorGrading = settings.ColorGrading;
// White Balance
WhiteTemp = toneMapping.WhiteTemperature;
WhiteTint = toneMapping.WhiteTint;
// Shadows
ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
// Midtones
ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
// Highlights
ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
//
Texture* lutTexture = colorGrading.LutTexture.Get();
const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
lut = useLut ? lutTexture->GetTexture() : nullptr;
}
});
Data DefaultData;
// Custom render buffer for caching Color Grading LUT.
class ColorGradingCustomBuffer : public RenderBuffers::CustomBuffer
{
@@ -46,7 +83,7 @@ public:
GPUTexture* LUT = nullptr;
Data CachedData;
ToneMappingMode Mode = ToneMappingMode::None;
Texture* LutTexture = nullptr;
GPUTexture* LutTexture = nullptr;
#if COMPILE_WITH_DEV_ENV
uint64 FrameRendered = 0;
#endif
@@ -82,6 +119,9 @@ bool ColorGradingPass::Init()
#if COMPILE_WITH_DEV_ENV
_shader.Get()->OnReloading.Bind<ColorGradingPass, &ColorGradingPass::OnShaderReloading>(this);
#endif
PostProcessSettings defaultSettings;
GPUTexture* defaultLut;
DefaultData.Init(defaultSettings, defaultLut);
return false;
}
@@ -125,6 +165,18 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
{
PROFILE_CPU();
// Prepare the parameters
Data data;
GPUTexture* lutTexture;
auto& toneMapping = renderContext.List->Settings.ToneMapping;
data.Init(renderContext.List->Settings, lutTexture);
// Skip if color grading is unsued
if (Platform::MemoryCompare(&DefaultData, &data, sizeof(Data)) == 0 &&
lutTexture == nullptr &&
toneMapping.Mode == ToneMappingMode::None)
return nullptr;
// Check if can use volume texture (3D) for a LUT (faster on modern platforms, requires geometry shader)
const auto device = GPUDevice::Instance;
bool use3D = GPU_ALLOW_GEOMETRY_SHADERS && Graphics::PostProcessing::ColorGradingVolumeLUT;
@@ -172,41 +224,8 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
RENDER_TARGET_POOL_SET_NAME(colorGradingBuffer.LUT, "ColorGrading.LUT");
}
// Prepare the parameters
Data data;
data.Dummy = Float2::Zero;
auto& toneMapping = renderContext.List->Settings.ToneMapping;
auto& colorGrading = renderContext.List->Settings.ColorGrading;
// White Balance
data.WhiteTemp = toneMapping.WhiteTemperature;
data.WhiteTint = toneMapping.WhiteTint;
// Shadows
data.ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
data.ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
data.ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
data.ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
data.ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
data.ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
// Midtones
data.ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
data.ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
data.ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
data.ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
data.ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
// Highlights
data.ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
data.ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
data.ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
data.ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
data.ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
data.ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
//
Texture* lutTexture = colorGrading.LutTexture.Get();
const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
data.LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
// Check if LUT parameter hasn't been changed since the last time
if (Platform::MemoryCompare(&colorGradingBuffer.CachedData , &data, sizeof(Data)) == 0 &&
if (Platform::MemoryCompare(&colorGradingBuffer.CachedData, &data, sizeof(Data)) == 0 &&
colorGradingBuffer.Mode == toneMapping.Mode &&
#if COMPILE_WITH_DEV_ENV
colorGradingBuffer.FrameRendered > _reloadedFrame &&
@@ -232,7 +251,7 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
context->BindCB(0, cb);
context->SetViewportAndScissors((float)lutDesc.Width, (float)lutDesc.Height);
context->SetState(_psLut.Get((int32)toneMapping.Mode));
context->BindSR(0, useLut ? lutTexture->GetTexture() : nullptr);
context->BindSR(0, lutTexture);
#if GPU_ALLOW_GEOMETRY_SHADERS
if (use3D)
{

View File

@@ -11,6 +11,7 @@
#include "Engine/Core/Math/Quaternion.h"
#include "Engine/Core/Config/GraphicsSettings.h"
#include "Engine/Engine/Engine.h"
#include "Engine/Engine/Units.h"
#include "Engine/Content/Content.h"
#include "Engine/Debug/DebugDraw.h"
#include "Engine/Graphics/GPUContext.h"
@@ -41,6 +42,7 @@
#define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
#define DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE 8
#define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
#define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
#define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging
@@ -68,11 +70,14 @@ GPU_CB_STRUCT(Data0 {
Int4 ProbeScrollClears[4];
Float3 ViewDir;
float Padding1;
Float3 QuantizationError;
int32 FrameIndexMod8;
});
GPU_CB_STRUCT(Data1 {
// TODO: use push constants on Vulkan or root signature data on DX12 to reduce overhead of changing single DWORD
Float2 Padding2;
float Padding2;
int32 StepSize;
uint32 CascadeIndex;
uint32 ProbeIndexOffset;
});
@@ -214,6 +219,7 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
return true;
_csClassify = shader->GetCS("CS_Classify");
_csUpdateProbesInitArgs = shader->GetCS("CS_UpdateProbesInitArgs");
_csUpdateInactiveProbes = shader->GetCS("CS_UpdateInactiveProbes");
_csTraceRays[0] = shader->GetCS("CS_TraceRays", 0);
_csTraceRays[1] = shader->GetCS("CS_TraceRays", 1);
_csTraceRays[2] = shader->GetCS("CS_TraceRays", 2);
@@ -245,6 +251,7 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
LastFrameShaderReload = Engine::FrameCount;
_csClassify = nullptr;
_csUpdateProbesInitArgs = nullptr;
_csUpdateInactiveProbes = nullptr;
_csTraceRays[0] = nullptr;
_csTraceRays[1] = nullptr;
_csTraceRays[2] = nullptr;
@@ -322,7 +329,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
const float indirectLightingIntensity = settings.Intensity;
const float probeHistoryWeight = Math::Clamp(settings.TemporalResponse, 0.0f, 0.98f);
const float distance = settings.Distance;
const Color fallbackIrradiance = settings.FallbackIrradiance;
// Automatically calculate amount of cascades to cover the GI distance at the current probes spacing
const int32 idealProbesCount = 20; // Ideal amount of probes per-cascade to try to fit in order to cover whole distance
@@ -335,7 +341,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
}
// Calculate the probes count based on the amount of cascades and the distance to cover
const float cascadesDistanceScales[] = { 1.0f, 3.0f, 6.0f, 10.0f }; // Scales each cascade further away from the camera origin
const float cascadesDistanceScales[] = { 1.0f, 3.0f, 5.0f, 10.0f }; // Scales each cascade further away from the camera origin
const float distanceExtent = distance / cascadesDistanceScales[cascadesCount - 1];
const float verticalRangeScale = 0.8f; // Scales the probes volume size at Y axis (horizontal aspect ratio makes the DDGI use less probes vertically to cover whole screen)
Int3 probesCounts(Float3::Ceil(Float3(distanceExtent, distanceExtent * verticalRangeScale, distanceExtent) / probesSpacing));
@@ -351,6 +357,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
// Initialize cascades
float probesSpacings[4];
Float3 viewOrigins[4];
Float3 blendOrigins[4];
for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++)
{
// Each cascade has higher spacing between probes
@@ -361,14 +368,15 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
// Calculate view origin for cascade by shifting it towards the view direction to account for better view frustum coverage
Float3 viewOrigin = renderContext.View.Position;
Float3 viewDirection = renderContext.View.Direction;
const Float3 probesDistance = Float3(probesCounts) * cascadeProbesSpacing;
const Float3 probesDistance = Float3(probesCounts - 1) * cascadeProbesSpacing;
const float probesDistanceMax = probesDistance.MaxValue();
const Float3 viewRayHit = CollisionsHelper::LineHitsBox(viewOrigin, viewOrigin + viewDirection * (probesDistanceMax * 2.0f), viewOrigin - probesDistance, viewOrigin + probesDistance);
const float viewOriginOffset = viewRayHit.Y * probesDistanceMax * 0.6f;
viewOrigin += viewDirection * viewOriginOffset;
//viewOrigin = Float3::Zero;
blendOrigins[cascadeIndex] = viewOrigin;
const float viewOriginSnapping = cascadeProbesSpacing;
viewOrigin = Float3::Floor(viewOrigin / viewOriginSnapping) * viewOriginSnapping;
//viewOrigin = Float3::Zero;
viewOrigins[cascadeIndex] = viewOrigin;
}
@@ -500,6 +508,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
{
auto& cascade = ddgiData.Cascades[cascadeIndex];
ddgiData.Result.Constants.ProbesOriginAndSpacing[cascadeIndex] = Float4(cascade.ProbesOrigin, cascade.ProbesSpacing);
ddgiData.Result.Constants.BlendOrigin[cascadeIndex] = Float4(blendOrigins[cascadeIndex], 0.0f);
ddgiData.Result.Constants.ProbesScrollOffsets[cascadeIndex] = Int4(cascade.ProbeScrollOffsets, 0);
}
ddgiData.Result.Constants.RayMaxDistance = distance;
@@ -508,7 +517,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
ddgiData.Result.Constants.ProbeHistoryWeight = probeHistoryWeight;
ddgiData.Result.Constants.IrradianceGamma = 1.5f;
ddgiData.Result.Constants.IndirectLightingIntensity = indirectLightingIntensity;
ddgiData.Result.Constants.FallbackIrradiance = fallbackIrradiance.ToFloat3() * fallbackIrradiance.A;
ddgiData.Result.Constants.FallbackIrradiance = settings.FallbackIrradiance.ToFloat4();
ddgiData.Result.ProbesData = ddgiData.ProbesData->View();
ddgiData.Result.ProbesDistance = ddgiData.ProbesDistance->View();
ddgiData.Result.ProbesIrradiance = ddgiData.ProbesIrradiance->View();
@@ -535,6 +544,8 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
data.TemporalTime = renderContext.List->Setup.UseTemporalAAJitter ? RenderTools::ComputeTemporalTime() : 0.0f;
data.ViewDir = renderContext.View.Direction;
data.SkyboxIntensity = renderContext.List->Sky ? renderContext.List->Sky->GetIndirectLightingIntensity() : 1.0f;
data.QuantizationError = RenderTools::GetColorQuantizationError(ddgiData.ProbesIrradiance->Format());
data.FrameIndexMod8 = (int32)(Engine::FrameCount % 8);
GBufferPass::SetInputs(renderContext.View, data.GBuffer);
context->UpdateCB(_cb0, &data);
context->BindCB(0, _cb0);
@@ -581,6 +592,23 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
context->ResetUA();
}
// For inactive probes, search nearby ones to find the closest valid for quick fallback when sampling irradiance
{
PROFILE_GPU_CPU_NAMED("Update Inactive Probes");
// TODO: this could run within GPUComputePass during Trace Rays or Update Probes to overlap compute works
context->BindUA(0, ddgiData.Result.ProbesData);
Data1 data;
data.CascadeIndex = cascadeIndex;
int32 iterations = Math::CeilToInt(Math::Log2((float)Math::Min(probesCounts.MaxValue(), DDGI_PROBE_EMPTY_AREA_DENSITY) + 1.0f));
for (int32 i = iterations - 1; i >= 0; i--)
{
data.StepSize = Math::FloorToInt(Math::Pow(2, (float)i) + 0.5f); // Jump Flood step size
context->UpdateCB(_cb1, &data);
context->Dispatch(_csUpdateInactiveProbes, threadGroupsX, 1, 1);
}
context->ResetUA();
}
// Update probes in batches so ProbesTrace texture can be smaller
uint32 arg = 0;
// TODO: use rays allocator to dispatch raytracing in packets (eg. 8 threads in a group instead of hardcoded limit)

View File

@@ -15,7 +15,8 @@ public:
// Constant buffer data for DDGI access on a GPU.
GPU_CB_STRUCT(ConstantsData {
Float4 ProbesOriginAndSpacing[4];
Int4 ProbesScrollOffsets[4];
Float4 BlendOrigin[4]; // w is unused
Int4 ProbesScrollOffsets[4]; // w is unused
uint32 ProbesCounts[3];
uint32 CascadesCount;
float IrradianceGamma;
@@ -24,8 +25,7 @@ public:
float IndirectLightingIntensity;
Float3 ViewPos;
uint32 RaysCount;
Float3 FallbackIrradiance;
float Padding0;
Float4 FallbackIrradiance;
});
// Binding data for the GPU.
@@ -44,6 +44,7 @@ private:
GPUConstantBuffer* _cb1 = nullptr;
GPUShaderProgramCS* _csClassify;
GPUShaderProgramCS* _csUpdateProbesInitArgs;
GPUShaderProgramCS* _csUpdateInactiveProbes;
GPUShaderProgramCS* _csTraceRays[4];
GPUShaderProgramCS* _csUpdateProbesIrradiance;
GPUShaderProgramCS* _csUpdateProbesDistance;

View File

@@ -428,6 +428,7 @@ public:
// Write to objects buffer (this must match unpacking logic in HLSL)
uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4);
ObjectsListBuffer.Write(objectAddress);
ObjectsBuffer.Data.EnsureCapacity(ObjectsBuffer.Data.Count() + sizeof(Float4) * (GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE + 6 * GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE));
auto* objectData = ObjectsBuffer.WriteReserve<Float4>(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE);
objectData[0] = Float4(object.Position, object.Radius);
objectData[1] = Float4::Zero;
@@ -511,6 +512,7 @@ public:
{
// Dirty object to redraw
object->LastFrameUpdated = 0;
return;
}
GlobalSurfaceAtlasLight* light = Lights.TryGet(a->GetID());
if (light)

View File

@@ -269,7 +269,7 @@ void PostProcessingPass::Render(RenderContext& renderContext, GPUTexture* input,
int32 bloomMipCount = CalculateBloomMipCount(w1, h1);
// Ensure to have valid data and if at least one effect should be applied
if (!(useBloom || useToneMapping || useCameraArtifacts) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
if (!(useBloom || useToneMapping || useCameraArtifacts || colorGradingLUT) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
{
// Resources are missing. Do not perform rendering. Just copy raw frame
context->SetViewportAndScissors((float)output->Width(), (float)output->Height());

View File

@@ -402,6 +402,8 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont
case ViewMode::MaterialComplexity:
case ViewMode::Wireframe:
case ViewMode::NoPostFx:
case ViewMode::VertexColors:
case ViewMode::QuadOverdraw:
setup.UseTemporalAAJitter = false;
break;
}

View File

@@ -137,8 +137,8 @@ namespace FlaxEngine
{
Debug.LogError($"Unhandled Exception: {exception.Message}");
Debug.LogException(exception);
if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
Platform.Fatal($"Unhandled Exception: {exception}");
//if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
// Platform.Fatal($"Unhandled Exception: {exception}");
}
}

View File

@@ -11,6 +11,11 @@ namespace FlaxEngine.GUI
[ActorToolbox("GUI")]
public class DropPanel : ContainerControl
{
/// <summary>
/// Size of the drop down icon.
/// </summary>
public const float DropDownIconSize = 14.0f;
/// <summary>
/// The header height.
/// </summary>
@@ -368,7 +373,7 @@ namespace FlaxEngine.GUI
var style = Style.Current;
var enabled = EnabledInHierarchy;
// Paint Background
// Draw Background
var backgroundColor = BackgroundColor;
if (backgroundColor.A > 0.0f)
{
@@ -386,7 +391,7 @@ namespace FlaxEngine.GUI
float textLeft = 0;
if (EnableDropDownIcon)
{
textLeft += 14;
textLeft += DropDownIconSize;
var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12);
var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey;
if (_isClosed)
@@ -395,7 +400,7 @@ namespace FlaxEngine.GUI
ArrowImageOpened?.Draw(dropDownRect, arrowColor);
}
// Text
// Header text
var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight);
_headerTextMargin.ShrinkRectangle(ref textRect);
var textColor = HeaderTextColor;
@@ -404,7 +409,9 @@ namespace FlaxEngine.GUI
textColor *= 0.6f;
}
Render2D.PushClip(textRect);
Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center);
Render2D.PopClip();
if (!_isClosed && EnableContainmentLines)
{

View File

@@ -20,17 +20,23 @@
#define DDGI_PROBE_ATTENTION_MAX 0.98f // Maximum probe attention value that still makes it active (but not activated which is 1.0f).
#define DDGI_PROBE_RESOLUTION_IRRADIANCE 6 // Resolution (in texels) for probe irradiance data (excluding 1px padding on each side)
#define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
#define DDGI_CASCADE_BLEND_SIZE 2.5f // Distance in probes over which cascades blending happens
#define DDGI_CASCADE_BLEND_SIZE 2.0f // Distance in probes over which cascades blending happens
#ifndef DDGI_CASCADE_BLEND_SMOOTH
#define DDGI_CASCADE_BLEND_SMOOTH 0 // Enables smooth cascade blending, otherwise dithering will be used
#endif
#define DDGI_SRGB_BLENDING 1 // Enables blending in sRGB color space, otherwise irradiance blending is done in linear space
#define DDGI_DEFAULT_BIAS 0.2f // Default value for DDGI sampling bias
#define DDGI_FALLBACK_COORDS_ENCODE(coord) ((float3)(coord + 1) / 128.0f)
#define DDGI_FALLBACK_COORDS_DECODE(data) (uint3)(data.xyz * 128.0f - 1)
#define DDGI_FALLBACK_COORDS_VALID(data) (length(data.xyz) > 0)
//#define DDGI_DEBUG_CASCADE 0 // Forces a specific cascade to be only in use (for debugging)
// DDGI data for a constant buffer
struct DDGIData
{
float4 ProbesOriginAndSpacing[4];
int4 ProbesScrollOffsets[4]; // w unused
float4 BlendOrigin[4]; // w is unused
int4 ProbesScrollOffsets[4]; // w is unused
uint3 ProbesCounts;
uint CascadesCount;
float IrradianceGamma;
@@ -39,8 +45,7 @@ struct DDGIData
float IndirectLightingIntensity;
float3 ViewPos;
uint RaysCount;
float3 FallbackIrradiance;
float Padding0;
float4 FallbackIrradiance;
};
uint GetDDGIProbeIndex(DDGIData data, uint3 probeCoords)
@@ -159,6 +164,8 @@ float2 GetDDGIProbeUV(DDGIData data, uint cascadeIndex, uint probeIndex, float2
float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, uint cascadeIndex, float3 probesOrigin, float3 probesExtent, float probesSpacing, float3 biasedWorldPosition)
{
bool invalidCascade = cascadeIndex >= data.CascadesCount;
cascadeIndex = min(cascadeIndex, data.CascadesCount - 1);
uint3 probeCoordsEnd = data.ProbesCounts - uint3(1, 1, 1);
uint3 baseProbeCoords = clamp(uint3((worldPosition - probesOrigin + probesExtent) / probesSpacing), uint3(0, 0, 0), probeCoordsEnd);
@@ -168,7 +175,6 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
// Loop over the closest probes to accumulate their contributions
float4 irradiance = float4(0, 0, 0, 0);
const int3 SearchAxisMasks[3] = { int3(1, 0, 0), int3(0, 1, 0), int3(0, 0, 1) };
for (uint i = 0; i < 8; i++)
{
uint3 probeCoordsOffset = uint3(i, i >> 1, i >> 2) & 1;
@@ -178,33 +184,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
// Load probe position and state
float4 probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
uint probeState = DecodeDDGIProbeState(probeData);
uint useVisibility = true;
float minWight = 0.000001f;
if (probeState == DDGI_PROBE_STATE_INACTIVE)
{
// Search nearby probes to find any nearby GI sample
for (int searchDistance = 1; searchDistance < 3 && probeState == DDGI_PROBE_STATE_INACTIVE; searchDistance++)
for (uint searchAxis = 0; searchAxis < 3; searchAxis++)
{
int searchAxisDir = probeCoordsOffset[searchAxis] ? 1 : -1;
int3 searchCoordsOffset = SearchAxisMasks[searchAxis] * searchAxisDir * searchDistance;
uint3 searchCoords = clamp((int3)probeCoords + searchCoordsOffset, int3(0, 0, 0), (int3)probeCoordsEnd);
uint searchIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, searchCoords);
float4 searchData = LoadDDGIProbeData(data, probesData, cascadeIndex, searchIndex);
uint searchState = DecodeDDGIProbeState(searchData);
if (searchState != DDGI_PROBE_STATE_INACTIVE)
{
// Use nearby probe as a fallback (visibility test might ignore it but with smooth gradient)
probeCoords = searchCoords;
probeIndex = searchIndex;
probeData = searchData;
probeState = searchState;
break;
}
}
if (probeState == DDGI_PROBE_STATE_INACTIVE)
continue;
// Use fallback probe that is closest to this one
uint3 fallbackCoords = DDGI_FALLBACK_COORDS_DECODE(probeData);
float fallbackToProbeDist = length((float3)probeCoords - (float3)fallbackCoords);
useVisibility = fallbackToProbeDist <= 1.0f; // Skip visibility test that blocks too far probes due to limiting max distance to 1.5 of probe spacing
if (fallbackToProbeDist > 2.0f) minWight = 1.0f;
probeCoords = fallbackCoords;
probeIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, fallbackCoords);
probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
//if (DecodeDDGIProbeState(probeData) == DDGI_PROBE_STATE_INACTIVE) continue;
}
float3 probeBasePosition = baseProbeWorldPosition + ((probeCoords - baseProbeCoords) * probesSpacing);
float3 probePosition = probeBasePosition + probeData.xyz * probesSpacing; // Probe offset is [-1;1] within probes spacing
// Calculate probe position
float3 probePosition = baseProbeWorldPosition + (((float3)probeCoords - (float3)baseProbeCoords) * probesSpacing) + probeData.xyz * probesSpacing;
// Calculate the distance and direction from the (biased and non-biased) shading point and the probe
float3 worldPosToProbe = normalize(probePosition - worldPosition);
@@ -213,6 +209,7 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
// Smooth backface test
float weight = Square(dot(worldPosToProbe, worldNormal) * 0.5f + 0.5f);
weight = max(weight, 0.1f);
// Sample distance texture
float2 octahedralCoords = GetOctahedralCoords(-biasedPosToProbe);
@@ -220,24 +217,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
float2 probeDistance = probesDistance.SampleLevel(SamplerLinearClamp, uv, 0).rg * 2.0f;
// Visibility weight (Chebyshev)
if (biasedPosToProbeDist > probeDistance.x)
if (biasedPosToProbeDist > probeDistance.x && useVisibility)
{
float variance = abs(Square(probeDistance.x) - probeDistance.y);
float visibilityWeight = variance / (variance + Square(biasedPosToProbeDist - probeDistance.x));
weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.05f);
weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.0f);
}
// Avoid a weight of zero
weight = max(weight, 0.000001f);
weight = max(weight, minWight);
// Adjust weight curve to inject a small portion of light
const float minWeightThreshold = 0.2f;
if (weight < minWeightThreshold)
weight *= Square(weight) / Square(minWeightThreshold);
if (weight < minWeightThreshold) weight *= (weight * weight) * (1.0f / (minWeightThreshold * minWeightThreshold));
// Calculate trilinear weights based on the distance to each probe to smoothly transition between grid of 8 probes
float3 trilinear = lerp(1.0f - biasAlpha, biasAlpha, (float3)probeCoordsOffset);
weight *= max(trilinear.x * trilinear.y * trilinear.z, 0.001f);
weight *= saturate(trilinear.x * trilinear.y * trilinear.z * 2.0f);
// Sample irradiance texture
octahedralCoords = GetOctahedralCoords(worldNormal);
@@ -269,7 +265,9 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
if (irradiance.a > 0.0f)
{
// Normalize irradiance
irradiance.rgb /= irradiance.a;
//irradiance.rgb /= irradiance.a;
//irradiance.rgb /= lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
irradiance.rgb /= invalidCascade ? irradiance.a : lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
#if DDGI_SRGB_BLENDING
irradiance.rgb *= irradiance.rgb;
#endif
@@ -281,22 +279,34 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
float3 GetDDGISurfaceBias(float3 viewDir, float probesSpacing, float3 worldNormal, float bias)
{
// Bias the world-space position to reduce artifacts
return (worldNormal * 0.2f + viewDir * 0.8f) * (0.75f * probesSpacing * bias);
return (worldNormal * 0.2f + viewDir * 0.8f) * (0.6f * probesSpacing * bias);
}
// [Inigo Quilez, https://iquilezles.org/articles/distfunctions/]
float sdRoundBox(float3 p, float3 b, float r)
{
float3 q = abs(p) - b + r;
return length(max(q, 0.0f)) + min(max(q.x, max(q.y, q.z)), 0.0f) - r;
}
// Samples DDGI probes volume at the given world-space position and returns the irradiance.
// bias - scales the bias vector to the initial sample point to reduce self-shading artifacts
// dither - randomized per-pixel value in range 0-1, used to smooth dithering for cascades blending
float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = 0.2f, float dither = 0.0f)
float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = DDGI_DEFAULT_BIAS, float dither = 0.0f)
{
// Select the highest cascade that contains the sample location
uint cascadeIndex = 0;
float probesSpacing = 0, cascadeWeight = 0;
float3 probesOrigin = (float3)0, probesExtent = (float3)0, biasedWorldPosition = (float3)0;
float3 viewDir = normalize(data.ViewPos - worldPosition);
#if DDGI_CASCADE_BLEND_SMOOTH
dither = 0.0f;
#endif
#ifdef DDGI_DEBUG_CASCADE
uint cascadeIndex = DDGI_DEBUG_CASCADE;
#else
uint cascadeIndex = 0;
if (data.CascadesCount == 0)
return float3(0, 0, 0);
for (; cascadeIndex < data.CascadesCount; cascadeIndex++)
{
// Get cascade data
@@ -306,26 +316,21 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, T
biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);
// Calculate cascade blending weight (use input bias to smooth transition)
float cascadeBlendSmooth = frac(max(distance(data.ViewPos, worldPosition) - probesExtent.x, 0) / probesSpacing) * 0.1f;
float3 cascadeBlendPoint = worldPosition - probesOrigin - cascadeBlendSmooth * probesSpacing;
float fadeDistance = probesSpacing * DDGI_CASCADE_BLEND_SIZE;
#if DDGI_CASCADE_BLEND_SMOOTH
fadeDistance *= 2.0f; // Make it even smoother when using linear blending
#endif
cascadeWeight = saturate(Min3(probesExtent - abs(cascadeBlendPoint)) / fadeDistance);
float3 blendPos = worldPosition - data.BlendOrigin[cascadeIndex].xyz;
cascadeWeight = sdRoundBox(blendPos, probesExtent - probesSpacing, probesSpacing * 2) + fadeDistance;
cascadeWeight = 1 - saturate(cascadeWeight / fadeDistance);
if (cascadeWeight > dither)
break;
}
if (cascadeIndex == data.CascadesCount)
return data.FallbackIrradiance;
#endif
// Sample cascade
float3 result = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);
// Blend with the next cascade (or fallback irradiance outside the volume)
#if DDGI_CASCADE_BLEND_SMOOTH && !defined(DDGI_DEBUG_CASCADE)
cascadeIndex++;
#if DDGI_CASCADE_BLEND_SMOOTH
result *= cascadeWeight;
if (cascadeIndex < data.CascadesCount && cascadeWeight < 0.99f)
{
probesSpacing = data.ProbesOriginAndSpacing[cascadeIndex].w;
@@ -333,18 +338,16 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, T
probesExtent = (data.ProbesCounts - 1) * (probesSpacing * 0.5f);
biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);
float3 resultNext = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);
result *= cascadeWeight;
result += resultNext * (1 - cascadeWeight);
}
else
{
result += data.FallbackIrradiance * (1 - cascadeWeight);
}
#else
if (cascadeIndex == data.CascadesCount)
{
result += data.FallbackIrradiance * (1 - cascadeWeight);
}
#endif
if (cascadeIndex >= data.CascadesCount)
{
// Blend between the last cascade and the fallback irradiance
float fallbackWeight = (1 - cascadeWeight) * data.FallbackIrradiance.a;
result = lerp(result, data.FallbackIrradiance.rgb, fallbackWeight);
}
return result;
}

View File

@@ -13,6 +13,7 @@
#include "./Flax/Math.hlsl"
#include "./Flax/Noise.hlsl"
#include "./Flax/Quaternion.hlsl"
#include "./Flax/MonteCarlo.hlsl"
#include "./Flax/GlobalSignDistanceField.hlsl"
#include "./Flax/GI/GlobalSurfaceAtlas.hlsl"
#include "./Flax/GI/DDGI.hlsl"
@@ -26,6 +27,7 @@
#define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
#define DDGI_PROBE_RELOCATE_ITERATIVE 1 // If true, probes relocation algorithm tries to move them in additive way, otherwise all nearby locations are checked to find the best position
#define DDGI_PROBE_RELOCATE_FIND_BEST 1 // If true, probes relocation algorithm tries to move to the best matching location within nearby area
#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
#define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
#define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging
@@ -42,10 +44,13 @@ float TemporalTime;
int4 ProbeScrollClears[4];
float3 ViewDir;
float Padding1;
float3 QuantizationError;
uint FrameIndexMod8;
META_CB_END
META_CB_BEGIN(1, Data1)
float2 Padding2;
float Padding2;
int StepSize;
uint CascadeIndex;
uint ProbeIndexOffset;
META_CB_END
@@ -98,6 +103,11 @@ float3 Remap(float3 value, float3 fromMin, float3 fromMax, float3 toMin, float3
return (value - fromMin) / (fromMax - fromMin) * (toMax - toMin) + toMin;
}
bool IsProbeAtBorder(uint3 probeCoords)
{
return min(probeCoords.x, min(probeCoords.y, probeCoords.z)) == 0 || probeCoords.x == DDGI.ProbesCounts.x - 1 || probeCoords.y == DDGI.ProbesCounts.y - 1 || probeCoords.z == DDGI.ProbesCounts.z - 1;
}
// Compute shader for updating probes state between active and inactive and performing probes relocation.
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
@@ -112,6 +122,14 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
float3 probeBasePosition = GetDDGIProbeWorldPosition(DDGI, CascadeIndex, probeCoords);
#ifdef DDGI_DEBUG_CASCADE
// Single cascade-only debugging
if (CascadeIndex != DDGI_DEBUG_CASCADE)
{
RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
return;
}
#else
// Disable probes that are is in the range of higher-quality cascade
if (CascadeIndex > 0)
{
@@ -119,15 +137,15 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float prevProbesSpacing = DDGI.ProbesOriginAndSpacing[prevCascade].w;
float3 prevProbesOrigin = DDGI.ProbesScrollOffsets[prevCascade].xyz * prevProbesSpacing + DDGI.ProbesOriginAndSpacing[prevCascade].xyz;
float3 prevProbesExtent = (DDGI.ProbesCounts - 1) * (prevProbesSpacing * 0.5f);
prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE); // Apply safe margin to allow probes on cascade edges
prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE) * 2; // Apply safe margin to allow probes on cascade edges
float prevCascadeWeight = Min3(prevProbesExtent - abs(probeBasePosition - prevProbesOrigin));
if (prevCascadeWeight > 0.1f)
{
// Disable probe
RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
return;
}
}
#endif
// Check if probe was scrolled
int3 probeScrollClears = ProbeScrollClears[CascadeIndex].xyz;
@@ -171,9 +189,29 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
float voxelLimit = GlobalSDF.CascadeVoxelSize[CascadeIndex] * 0.8f;
float distanceLimit = probesSpacing * ProbesDistanceLimits[CascadeIndex];
float relocateLimit = probesSpacing * ProbesRelocateLimits[CascadeIndex];
if (sdfDst > distanceLimit + length(probeOffset)) // Probe is too far from geometry (or deep inside)
#ifdef DDGI_PROBE_EMPTY_AREA_DENSITY
uint3 probeCoordsStable = GetDDGIProbeCoords(DDGI, probeIndex);
if (sdf > probesSpacing * DDGI.ProbesCounts.x * 0.3f
#if DDGI_PROBE_EMPTY_AREA_DENSITY > 1
&& (
// Low-density grid grid
(probeCoordsStable.x % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.y % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.z % DDGI_PROBE_EMPTY_AREA_DENSITY == 0)
// Edge probes at the last cascade (for good fallback irradiance outside the GI distance)
//|| (CascadeIndex + 1 == DDGI.CascadesCount && IsProbeAtBorder(probeCoords))
)
#endif
)
{
// Disable it
// Addd some fallback probes in empty areas to provide valid GI for nearby dynamic objects or transparency
probeOffset = float3(0, 0, 0);
probeState = wasScrolled || probeStateOld == DDGI_PROBE_STATE_INACTIVE ? DDGI_PROBE_STATE_ACTIVATED : DDGI_PROBE_STATE_ACTIVE;
probeAttention = DDGI_PROBE_ATTENTION_MIN;
}
else
#endif
if (sdfDst > distanceLimit + length(probeOffset))
{
// Probe is too far from geometry (or deep inside) so disable it
probeOffset = float3(0, 0, 0);
probeState = DDGI_PROBE_STATE_INACTIVE;
probeAttention = 0.0f;
@@ -194,6 +232,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
probeAttention = clamp(probeAttention, DDGI_PROBE_ATTENTION_MIN, DDGI_PROBE_ATTENTION_MAX);
// Relocate only if probe location is not good enough
BRANCH
if (sdf <= voxelLimit)
{
#if DDGI_PROBE_RELOCATE_ITERATIVE
@@ -265,6 +304,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
bool wasActivated = probeStateOld == DDGI_PROBE_STATE_INACTIVE;
bool wasRelocated = distance(probeOffset, probeOffsetOld) > 2.0f;
#if DDGI_PROBE_RELOCATE_FIND_BEST || DDGI_PROBE_RELOCATE_ITERATIVE
BRANCH
if (wasRelocated && !wasActivated)
{
// If probe was relocated but the previous location is visible from the new one, then don't re-activate it for smoother blend
@@ -323,6 +363,78 @@ void CS_UpdateProbesInitArgs()
#endif
#ifdef _CS_UpdateInactiveProbes
RWTexture2D<snorm float4> RWProbesData : register(u0);
void CheckNearbyProbe(inout uint3 fallbackCoords, inout uint probeState, inout float minDistance, uint3 probeCoords, int3 probeCoordsEnd, int3 offset)
{
uint3 nearbyCoords = (uint3)clamp(((int3)probeCoords + offset), int3(0, 0, 0), probeCoordsEnd);
uint nearbyIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, nearbyCoords);
float4 nearbyData = RWProbesData[GetDDGIProbeTexelCoords(DDGI, CascadeIndex, nearbyIndex)];
float nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
if (DecodeDDGIProbeState(nearbyData) != DDGI_PROBE_STATE_INACTIVE && nearbyDist < minDistance)
{
// Use nearby probe
fallbackCoords = nearbyCoords;
probeState = DDGI_PROBE_STATE_ACTIVE;
minDistance = nearbyDist;
return;
}
nearbyCoords = DDGI_FALLBACK_COORDS_DECODE(nearbyData);
nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
if (DDGI_FALLBACK_COORDS_VALID(nearbyData) && nearbyDist < minDistance)
{
// Use fallback probe
fallbackCoords = nearbyCoords;
probeState = DDGI_PROBE_STATE_ACTIVE;
minDistance = nearbyDist;
}
}
// Compute shader to store closest valid probe coords inside inactive probes data for quick fallback lookup when sampling irradiance.
// Uses Jump Flood algorithm.
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
void CS_UpdateInactiveProbes(uint3 DispatchThreadId : SV_DispatchThreadID)
{
uint probeIndex = min(DispatchThreadId.x, ProbesCount - 1);
uint3 fallbackCoords = uint3(1000, 1000, 1000);
// Load probe data for the current thread
uint3 probeCoords = GetDDGIProbeCoords(DDGI, probeIndex);
probeIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, probeCoords);
int2 probeDataCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex);
float4 probeData = RWProbesData[probeDataCoords];
uint probeState = DecodeDDGIProbeState(probeData);
BRANCH
if (probeState == DDGI_PROBE_STATE_INACTIVE)
{
// Find the closest active probe (Jump Flood)
int3 probeCoordsEnd = (int3)DDGI.ProbesCounts - int3(1, 1, 1);
float minDistance = 1e27f;
UNROLL for (int z = -1; z <= 1; z++)
UNROLL for (int y = -1; y <= 1; y++)
UNROLL for (int x = -1; x <= 1; x++)
{
int3 offset = int3(x, y, z) * StepSize;
CheckNearbyProbe(fallbackCoords, probeState, minDistance, probeCoords, probeCoordsEnd, offset);
}
}
// Ensure all threads (within dispatch) got proper data before writing back to the same memory
AllMemoryBarrierWithGroupSync();
// Write modified probe data back (remain inactive)
BRANCH
if (probeState != DDGI_PROBE_STATE_INACTIVE && DispatchThreadId.x < ProbesCount && fallbackCoords.x != 1000)
{
RWProbesData[probeDataCoords] = EncodeDDGIProbeData(DDGI_FALLBACK_COORDS_ENCODE(fallbackCoords), DDGI_PROBE_STATE_INACTIVE, 0.0f);
}
}
#endif
#ifdef _CS_TraceRays
RWTexture2D<float4> RWProbesTrace : register(u0);
@@ -392,6 +504,8 @@ void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)
// Add some bias to prevent self occlusion artifacts in Chebyshev due to Global SDF being very incorrect in small scale
radiance.w = max(radiance.w + GlobalSDF.CascadeVoxelSize[hit.HitCascade] * 0.5f, 0);
float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
radiance.w += probesSpacing * 0.05f;
}
}
else
@@ -639,7 +753,7 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
// Add distance (R), distance^2 (G) and weight (A)
float rayDistance = CachedProbesTraceDistance[rayIndex];
result += float4(rayDistance * rayWeight, (rayDistance * rayDistance) * rayWeight, 0.0f, rayWeight);
result += float4(rayDistance, rayDistance * rayDistance, 0.0f, 1.0f) * rayWeight;
#endif
}
@@ -700,13 +814,17 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
//result.rgb = previous + (irradianceDelta * 0.25f);
}
result = float4(lerp(result.rgb, previous.rgb, historyWeight), 1.0f);
// Apply quantization error to reduce yellowish artifacts due to R11G11B10 format
float noise = InterleavedGradientNoise(octahedralCoords, FrameIndexMod8);
result.rgb = QuantizeColor(result.rgb, noise, QuantizationError);
#else
result = float4(lerp(result.rg, previous.rg, historyWeight), 0.0f, 1.0f);
#endif
RWOutput[outputCoords] = result;
GroupMemoryBarrierWithGroupSync();
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
#if DDGI_PROBE_UPDATE_MODE == 0
@@ -786,10 +904,10 @@ void PS_IndirectLighting(Quad_VS2PS input, out float4 output : SV_Target0)
}
// Sample irradiance
float bias = 0.2f;
float dither = RandN2(input.TexCoord + TemporalTime).x;
float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, gBuffer.WorldPos, gBuffer.Normal, bias, dither);
float3 samplePos = gBuffer.WorldPos + gBuffer.Normal * (dither * 0.1f + 0.1f);
float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, samplePos, gBuffer.Normal, DDGI_DEFAULT_BIAS, dither);
// Calculate lighting
float3 diffuseColor = GetDiffuseColor(gBuffer);
float3 diffuse = Diffuse_Lambert(diffuseColor);

View File

@@ -328,7 +328,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
viewRay = normalize(viewRay - ViewWorldPos);
trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
trace.NeedsHitNormal = true;
GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);
float3 color;
@@ -337,7 +336,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
// Sample Global Surface Atlas at the hit location
float surfaceThreshold = GetGlobalSurfaceAtlasThreshold(GlobalSDF, hit);
color = SampleGlobalSurfaceAtlas(GlobalSurfaceAtlas, GlobalSurfaceAtlasChunks, GlobalSurfaceAtlasCulledObjects, GlobalSurfaceAtlasObjects, GlobalSurfaceAtlasDepth, GlobalSurfaceAtlasTex, hit.GetHitPosition(trace), -viewRay, surfaceThreshold).rgb;
//color = hit.HitNormal * 0.5f + 0.5f;
}
else
{

View File

@@ -32,17 +32,13 @@ struct GlobalSDFTrace
float MinDistance;
float3 WorldDirection;
float MaxDistance;
float StepScale;
bool NeedsHitNormal;
void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance, float stepScale = 1.0f)
void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance)
{
WorldPosition = worldPosition;
WorldDirection = worldDirection;
MinDistance = minDistance;
MaxDistance = maxDistance;
StepScale = stepScale;
NeedsHitNormal = false;
}
};
@@ -75,12 +71,23 @@ void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldP
textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
}
// Clamps Global SDF cascade UV to ensure it can be sued for gradient sampling (clamps first and last pixels).
void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldPosition, out float3 cascadeUV, out float3 textureUV, out float3 textureMipUV)
{
float4 cascadePosDistance = data.CascadePosDistance[cascade];
float3 posInCascade = worldPosition - cascadePosDistance.xyz;
float cascadeSize = cascadePosDistance.w * 2;
cascadeUV = saturate(posInCascade / cascadeSize + 0.5f);
textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
float halfTexelOffsetMip = (GLOBAL_SDF_RASTERIZE_MIP_FACTOR * 0.5f) / data.Resolution;
textureMipUV = textureUV + float3(halfTexelOffsetMip / (float)data.CascadesCount, halfTexelOffsetMip, halfTexelOffsetMip); // Mipmaps are offset by half texel to sample correctly
}
// Clamps Global SDF cascade UV to ensure it can be used for gradient sampling (clamps first and last pixels).
void ClampGlobalSDFTextureGradientUV(const GlobalSDFData data, uint cascade, float texelOffset, inout float3 textureUV)
{
float cascadeSizeUV = 1.0f / data.CascadesCount;
float cascadeUVStart = cascadeSizeUV * cascade + texelOffset;
float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 3;
float cascadeUVStart = cascadeSizeUV * cascade + texelOffset * 2;
float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 4;
textureUV.x = clamp(textureUV.x, cascadeUVStart, cascadeUVEnd);
}
@@ -144,13 +151,13 @@ float SampleGlobalSDF(const GlobalSDFData data, Texture3D<snorm float> tex, Text
startCascade = min(startCascade, data.CascadesCount - 1);
for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
{
float3 cascadeUV, textureUV;
GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
float3 cascadeUV, textureUV, textureMipUV;
GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
float voxelSize = data.CascadeVoxelSize[cascade];
float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0);
float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0);
if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
{
distance = distanceMip * maxDistanceMip;
@@ -208,13 +215,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D<snorm float>
startCascade = min(startCascade, data.CascadesCount - 1);
for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
{
float3 cascadeUV, textureUV;
GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
float3 cascadeUV, textureUV, textureMipUV;
GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
float voxelSize = data.CascadeVoxelSize[cascade];
float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip;
if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
{
float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
@@ -236,13 +243,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D<snorm float>
{
distance = distanceMip;
float texelOffset = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR / data.Resolution;
ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureMipUV);
float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x + texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x - texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y + texelOffset, textureMipUV.z), 0).x;
float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y - texelOffset, textureMipUV.z), 0).x;
float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z + texelOffset), 0).x;
float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z - texelOffset), 0).x;
gradient = float3(xp - xn, yp - yn, zp - zn) * maxDistanceMip;
}
break;
@@ -290,59 +297,32 @@ GlobalSDFHit RayTraceGlobalSDF(const GlobalSDFData data, Texture3D<snorm float>
float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
LOOP
for (; step < 250 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
for (; step < 100 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
{
float3 stepPosition = trace.WorldPosition + trace.WorldDirection * stepTime;
float stepScale = trace.StepScale;
// Sample SDF
float stepDistance, voxelSizeScale = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR;
float3 cascadeUV, textureUV;
GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV);
float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
if (distanceMip < chunkSize)
{
stepDistance = distanceMip;
float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
if (distanceTex < chunkMargin)
{
stepDistance = distanceTex;
voxelSizeScale = 1.0f;
stepScale *= 0.63f; // Perform smaller steps nearby geometry
}
}
else
{
// Assume no SDF nearby so perform a jump to the next chunk
stepDistance = chunkSize;
voxelSizeScale = 1.0f;
}
float stepDistance;
float3 cascadeUV, textureUV, textureMipUV;
GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV, textureMipUV);
stepDistance = min(mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip, chunkSize);
float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
FLATTEN
if (distanceTex < chunkMargin)
stepDistance = distanceTex;
// Detect surface hit
float minSurfaceThickness = voxelSizeScale * voxelExtent * saturate(stepTime / voxelSize);
float minSurfaceThickness = voxelExtent * saturate(stepTime / voxelSize);
if (stepDistance < minSurfaceThickness)
{
// Surface hit
hit.HitTime = max(stepTime + stepDistance - minSurfaceThickness, 0.0f);
hit.HitCascade = cascade;
hit.HitSDF = stepDistance;
if (trace.NeedsHitNormal)
{
// Calculate hit normal from SDF gradient
float texelOffset = 1.0f / data.Resolution;
ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
float xp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
float xn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
float yp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
float yn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
float zp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
float zn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
hit.HitNormal = normalize(float3(xp - xn, yp - yn, zp - zn));
}
}
// Move forward
stepTime += max(stepDistance * stepScale, voxelSize);
stepTime += max(stepDistance, voxelSize);
}
hit.StepsCount += step;
}

View File

@@ -311,26 +311,39 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
viewRay = normalize(viewRay - ViewWorldPos);
trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
trace.NeedsHitNormal = true;
GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);
// Debug draw
float3 color = saturate(hit.StepsCount / 80.0f).xxx;
if (!hit.IsHit())
color.rg *= 0.4f;
#if 0
else
{
float3 color = saturate(hit.StepsCount / 50.0f).xxx;
if (hit.IsHit())
{
#if 1
float3 hitPosition = hit.GetHitPosition(trace);
float hitSDF;
float3 hitNormal = SampleGlobalSDFGradient(GlobalSDF, GlobalSDFTex, GlobalSDFMip, hitPosition, hitSDF, hit.HitCascade);
#if 1
// Composite step count with SDF normals
//color.rgb *= saturate(normalize(hitNormal) * 0.5f + 0.7f) + 0.3f;
color = lerp(normalize(hitNormal) * 0.5f + 0.5f, 1 - color, saturate(hit.StepsCount / 80.0f));
#else
// Debug draw SDF normals
color.rgb = normalize(hit.HitNormal) * 0.5f + 0.5f;
}
#elif 1
color = normalize(hitNormal) * 0.5f + 0.5f;
#endif
#else
// Heatmap with step count
if (hit.StepsCount > 40)
color = float3(saturate(hit.StepsCount / 80.0f), 0, 0);
else if (hit.StepsCount > 20)
color = float3(saturate(hit.StepsCount / 40.0f).xx, 0);
else
color = float3(0, saturate(hit.StepsCount / 20.0f), 0);
#endif
}
else
{
// Composite with SDF normals
color.rgb *= saturate(normalize(hit.HitNormal) * 0.5f + 0.7f) + 0.1f;
// Bluish sky
color.rg *= 0.4f;
}
#endif
return float4(color, 1);
}

View File

@@ -54,6 +54,26 @@ float2 PerlinNoiseFade(float2 t)
return t * t * t * (t * (t * 6.0 - 15.0) + 10.0);
}
// "Next Generation Post Processing in Call of Duty: Advanced Warfare"
// http://advances.realtimerendering.com/s2014/index.html
float InterleavedGradientNoise(float2 uv, uint frameCount)
{
const float2 magicFrameScale = float2(47, 17) * 0.695;
uv += frameCount * magicFrameScale;
const float3 magic = float3(0.06711056, 0.00583715, 52.9829189);
return frac(magic.z * frac(dot(uv, magic.xy)));
}
// Removes error from the color to properly store it in lower precision formats (error = 2^(-mantissaBits))
float3 QuantizeColor(float3 color, float noise, float3 error)
{
float3 delta = color * error;
delta.x = asfloat(asuint(delta.x) & ~0x007fffff);
delta.y = asfloat(asuint(delta.y) & ~0x007fffff);
delta.z = asfloat(asuint(delta.z) & ~0x007fffff);
return color + delta * noise;
}
float rand2dTo1d(float2 value, float2 dotDir = float2(12.9898, 78.233))
{
// https://www.ronja-tutorials.com/post/024-white-noise/

View File

@@ -1,8 +1,17 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
#ifdef MESHOPTIMIZER_ALLOC_EXPORT
meshopt_Allocator::Storage& meshopt_Allocator::storage()
{
meshopt_Allocator::Storage::allocate = allocate;
meshopt_Allocator::Storage::deallocate = deallocate;
static Storage s = {::operator new, ::operator delete };
return s;
}
#endif
void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
{
meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
s.allocate = allocate;
s.deallocate = deallocate;
}

File diff suppressed because it is too large Load Diff

View File

@@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind
return result;
}
meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
assert(index_count % 3 == 0);
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
meshopt_VertexFetchStatistics result = {};
unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
memset(vertex_visited, 0, vertex_count);
const size_t kCacheLine = 64;
const size_t kCacheSize = 128 * 1024;
// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
size_t cache[kCacheSize / kCacheLine] = {};
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
vertex_visited[index] = 1;
size_t start_address = index * vertex_size;
size_t end_address = start_address + vertex_size;
size_t start_tag = start_address / kCacheLine;
size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
assert(start_tag < end_tag);
for (size_t tag = start_tag; tag < end_tag; ++tag)
{
size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
// we store +1 since cache is filled with 0 by default
result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
cache[line] = tag + 1;
}
}
size_t unique_vertex_count = 0;
for (size_t i = 0; i < vertex_count; ++i)
unique_vertex_count += vertex_visited[i];
result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
return result;
}

View File

@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
const unsigned char kSequenceHeader = 0xd0;
static int gEncodeIndexVersion = 1;
const int kDecodeIndexVersion = 1;
typedef unsigned int VertexFifo[16];
typedef unsigned int EdgeFifo[16][2];
@@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
if (fer >= 0 && (fer >> 2) < 15)
{
// note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
const unsigned int* order = kTriangleIndexOrder[fer & 3];
unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
@@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
// note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
int fea = (a == next) ? (next++, 0) : 15;
int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
@@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
void meshopt_encodeIndexVersion(int version)
{
assert(unsigned(version) <= 1);
assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));
meshopt::gEncodeIndexVersion = version;
}
int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
{
if (buffer_size < 1)
return -1;
unsigned char header = buffer[0];
if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
return -1;
int version = header & 0x0f;
if (version > meshopt::kDecodeIndexVersion)
return -1;
return version;
}
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
using namespace meshopt;
@@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
return -1;
int version = buffer[0] & 0x0f;
if (version > 1)
if (version > kDecodeIndexVersion)
return -1;
EdgeFifo edgefifo;
@@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
// fifo reads are wrapped around 16 entry buffer
unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
unsigned int c = 0;
int fec = codetri & 15;
@@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
{
// fifo reads are wrapped around 16 entry buffer
unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
unsigned int c = (fec == 0) ? next : cf;
c = (fec == 0) ? next : cf;
int fec0 = fec == 0;
next += fec0;
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
else
{
unsigned int c = 0;
// fec - (fec ^ 3) decodes 13, 14 into -1, 1
// note that we need to update the last index since free indices are delta-encoded
last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushVertexFifo(vertexfifo, c, vertexfifooffset);
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
}
// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
pushEdgeFifo(edgefifo, c, b, edgefifooffset);
pushEdgeFifo(edgefifo, a, c, edgefifooffset);
// output triangle
writeTriangle(destination, i, index_size, a, b, c);
}
else
{
@@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
return -1;
int version = buffer[0] & 0x0f;
if (version > 1)
if (version > kDecodeIndexVersion)
return -1;
const unsigned char* data = buffer + 1;

View File

@@ -5,7 +5,9 @@
#include <string.h>
// This work is based on:
// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
namespace meshopt
{
@@ -85,6 +87,46 @@ struct VertexStreamHasher
}
};
struct VertexCustomHasher
{
const float* vertex_positions;
size_t vertex_stride_float;
int (*callback)(void*, unsigned int, unsigned int);
void* context;
size_t hash(unsigned int index) const
{
const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);
unsigned int x = key[0], y = key[1], z = key[2];
// replace negative zero with zero
x = (x == 0x80000000) ? 0 : x;
y = (y == 0x80000000) ? 0 : y;
z = (z == 0x80000000) ? 0 : z;
// scramble bits to make sure that integer coordinates have entropy in lower bits
x ^= x >> 17;
y ^= y >> 17;
z ^= z >> 17;
// Optimized Spatial Hashing for Collision Detection of Deformable Objects
return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
}
bool equal(unsigned int lhs, unsigned int rhs) const
{
const float* lp = vertex_positions + lhs * vertex_stride_float;
const float* rp = vertex_positions + rhs * vertex_stride_float;
if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2])
return false;
return callback ? callback(context, lhs, rhs) : true;
}
};
struct EdgeHasher
{
const unsigned int* remap;
@@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
allocator.deallocate(vertex_table);
}
template <typename Hash>
static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
{
memset(remap, -1, vertex_count * sizeof(unsigned int));
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
unsigned int next_vertex = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices ? indices[i] : unsigned(i);
assert(index < vertex_count);
if (remap[index] != ~0u)
continue;
unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
if (*entry == ~0u)
{
*entry = index;
remap[index] = next_vertex++;
}
else
{
assert(remap[*entry] != ~0u);
remap[index] = remap[*entry];
}
}
assert(next_vertex <= vertex_count);
return next_vertex;
}
template <size_t BlockSize>
static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
@@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex
}
}
template <typename Hash>
static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
{
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
if (remap[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
destination[i] = remap[index];
}
}
} // namespace meshopt
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
memset(destination, -1, vertex_count * sizeof(unsigned int));
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
unsigned int next_vertex = 0;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices ? indices[i] : unsigned(i);
assert(index < vertex_count);
if (destination[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
if (*entry == ~0u)
{
*entry = index;
destination[index] = next_vertex++;
}
else
{
assert(destination[*entry] != ~0u);
destination[index] = destination[*entry];
}
}
}
assert(next_vertex <= vertex_count);
return next_vertex;
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
}
meshopt_Allocator allocator;
memset(destination, -1, vertex_count * sizeof(unsigned int));
VertexStreamHasher hasher = {streams, stream_count};
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
unsigned int next_vertex = 0;
size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
{
using namespace meshopt;
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices ? indices[i] : unsigned(i);
assert(index < vertex_count);
assert(indices || index_count == vertex_count);
assert(!indices || index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
if (destination[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
meshopt_Allocator allocator;
VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context};
if (*entry == ~0u)
{
*entry = index;
destination[index] = next_vertex++;
}
else
{
assert(destination[*entry] != ~0u);
destination[index] = destination[*entry];
}
}
}
assert(next_vertex <= vertex_count);
return next_vertex;
return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
@@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned
assert(vertex_size <= vertex_stride);
meshopt_Allocator allocator;
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
if (remap[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
destination[i] = remap[index];
}
generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
}
meshopt_Allocator allocator;
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
VertexStreamHasher hasher = {streams, stream_count};
generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
}
void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
size_t table_size = hashBuckets(vertex_count);
unsigned int* table = allocator.allocate<unsigned int>(table_size);
memset(table, -1, table_size * sizeof(unsigned int));
for (size_t i = 0; i < index_count; ++i)
for (size_t i = 0; i < vertex_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
if (remap[index] == ~0u)
{
unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
if (*entry == ~0u)
*entry = unsigned(i);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
destination[i] = remap[index];
destination[i] = *entry;
}
}
@@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
memcpy(destination + i * 4, patch, sizeof(patch));
}
}
size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
assert(index_count % 3 == 0);
meshopt_Allocator allocator;
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
memset(remap, -1, vertex_count * sizeof(unsigned int));
// compute vertex valence; this is used to prioritize least used corner
// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
valence[index]++;
}
unsigned int reorder_offset = 0;
// assign provoking vertices; leave the rest for the next pass
for (size_t i = 0; i < index_count; i += 3)
{
unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
assert(a < vertex_count && b < vertex_count && c < vertex_count);
// try to rotate triangle such that provoking vertex hasn't been seen before
// if multiple vertices are new, prioritize the one with least valence
// this reduces the risk that a future triangle will have all three vertices seen
unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
if (vb != ~0u && vb <= va && vb <= vc)
{
// abc -> bca
unsigned int t = a;
a = b, b = c, c = t;
}
else if (vc != ~0u && vc <= va && vc <= vb)
{
// abc -> cab
unsigned int t = c;
c = b, b = a, a = t;
}
unsigned int newidx = reorder_offset;
// now remap[a] = ~0u or all three vertices are old
// recording remap[a] makes it possible to remap future references to the same index, conserving space
if (remap[a] == ~0u)
remap[a] = newidx;
// we need to clone the provoking vertex to get a unique index
// if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
reorder[reorder_offset++] = a;
// note: first vertex is final, the other two will be fixed up in next pass
destination[i + 0] = newidx;
destination[i + 1] = b;
destination[i + 2] = c;
// update vertex valences for corner heuristic
valence[a]--;
valence[b]--;
valence[c]--;
}
// remap or clone non-provoking vertices (iterating to skip provoking vertices)
int step = 1;
for (size_t i = 1; i < index_count; i += step, step ^= 3)
{
unsigned int index = destination[i];
if (remap[index] == ~0u)
{
// we haven't seen the vertex before as a provoking vertex
// to maintain the reference to the original vertex we need to clone it
unsigned int newidx = reorder_offset;
remap[index] = newidx;
reorder[reorder_offset++] = index;
}
destination[i] = remap[index];
}
assert(reorder_offset <= vertex_count + index_count / 3);
return reorder_offset;
}

View File

@@ -1,7 +1,7 @@
/**
* meshoptimizer - version 0.21
* meshoptimizer - version 1.0
*
* Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at https://github.com/zeux/meshoptimizer
*
* This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
#include <stddef.h>
/* Version macro; major * 1000 + minor * 10 + patch */
#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */
/* If no API is defined, assume default */
#ifndef MESHOPTIMIZER_API
@@ -29,11 +29,14 @@
#endif
/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
#ifndef MESHOPTIMIZER_EXPERIMENTAL
#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
#endif
/* C interface */
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
/**
@@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
*/
MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
/**
* Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
* As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
* Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
* Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
*
* destination must contain enough space for the resulting remap table (vertex_count elements)
* indices can be NULL if the input is unindexed
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
*/
MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
/**
* Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
*
@@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
*/
MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
/**
* Generates a remap table that maps all vertices with the same position to the same (existing) index.
* Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
* This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
*
* destination must contain enough space for the resulting remap table (vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
* Each triangle is converted into a 6-vertex patch with the following layout:
@@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
*/
MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
* Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
* This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
* The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
* The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
* For maximum efficiency the input index buffer should be optimized for vertex cache first.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
*/
MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
/**
* Vertex transform cache optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations
* If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
* If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
*/
@@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
* Vertex transform cache optimizer for FIFO caches
* Reorders indices to reduce the number of GPU vertex shader invocations
* Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
* If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
* If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
/**
* Overdraw optimizer
* Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
* If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
* If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
* Vertex fetch cache optimizer
* Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
* Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
* This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
* This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count elements)
* indices is used both as an input and as an output index buffer
@@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
/**
* Set index encoder format version
* Set index encoder format version (defaults to 1)
*
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
*/
MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
*/
MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
/**
* Get encoded index format version
* Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
* Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
*/
MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
/**
* Index sequence encoder
* Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
* Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
* This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
* Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
* For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
*
* buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
* vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
/**
* Set vertex encoder format version
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
* Vertex buffer encoder
* Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
* For compression level to take effect, the vertex encoding version must be set to 1.
* The default compression level implied by meshopt_encodeVertexBuffer is 2.
*
* buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
* vertex_size must be a multiple of 4 (and <= 256)
* level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
* version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
*/
MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
/**
* Set vertex encoder format version (defaults to 1)
*
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
*/
MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
@@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
* The decoder is safe to use for untrusted input, but it may produce garbage data.
*
* destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
* vertex_size must be a multiple of 4 (and <= 256)
*/
MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
/**
* Get encoded vertex format version
* Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
* Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
*/
MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
/**
* Vertex buffer filters
* These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
*
* meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
* meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
*
* meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
* meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
* Each component is stored as an 16-bit integer; stride must be equal to 8.
*
* meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
* Each 32-bit component is decoded in isolation; stride must be divisible by 4.
*
* meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
*/
MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
/**
* Vertex buffer filter encoders
* These functions can be used to encode data in a format that meshopt_decodeFilter can decode
*
* meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
* meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
* Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
* Input data must contain 4 floats for every vector (count*4 total).
*
* meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
@@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
* meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
* Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
* Input data must contain stride/4 floats for every vector (count*stride/4 total).
*
* meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
* Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
* Input data must contain 4 floats for every color (count*4 total).
*/
enum meshopt_EncodeExpMode
{
@@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode
meshopt_EncodeExpSharedVector,
/* When encoding exponents, use shared value for each component of all vectors (best compression) */
meshopt_EncodeExpSharedComponent,
/* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
meshopt_EncodeExpClamped,
};
MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
/**
* Simplification options
@@ -334,16 +416,34 @@ enum
meshopt_SimplifySparse = 1 << 1,
/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
meshopt_SimplifyErrorAbsolute = 1 << 2,
/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
meshopt_SimplifyPrune = 1 << 3,
/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
meshopt_SimplifyRegularize = 1 << 4,
/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
meshopt_SimplifyPermissive = 1 << 5,
};
/**
* Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
*/
enum
{
/* Do not move this vertex. */
meshopt_SimplifyVertex_Lock = 1 << 0,
/* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
meshopt_SimplifyVertex_Protect = 1 << 1,
};
/**
* Mesh simplifier
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
* If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
* If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
* Returns the number of indices after simplification, with destination containing new index data
*
* The resulting index buffer references vertices from the original vertex buffer.
* If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
* If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -354,45 +454,94 @@ enum
MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
* Experimental: Mesh simplifier with attribute metric
* The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
* Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
* Mesh simplifier with attribute metric
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
* Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
* If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
* Returns the number of indices after simplification, with destination containing new index data
*
* The resulting index buffer references vertices from the original vertex buffer.
* If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
* Note that the number of attributes with non-zero weights affects memory requirements and running time.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_attributes should have attribute_count floats for each vertex
* attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
* attribute_count must be <= 16
* attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
* attribute_count must be <= 32
* vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
* TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
* Experimental: Mesh simplifier (sloppy)
* Mesh simplifier with position/attribute update
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
* Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
* If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
* Returns the number of indices after simplification, indices are destructively updated with new index data
*
* The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
* Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
* Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
*
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_attributes should have attribute_count floats for each vertex
* attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
* attribute_count must be <= 32
* vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
/**
* Mesh simplifier (sloppy)
* Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
* The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
* Returns the number of indices after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
* If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
* If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
* result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
/**
* Experimental: Point cloud simplifier
* Mesh simplifier (pruner)
* Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
* Returns the number of indices after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
* If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
*/
MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
/**
* Point cloud simplifier
* Reduces the number of points in the cloud to reach the given target
* Returns the number of points after simplification, with destination containing new index data
* The resulting index buffer references vertices from the original vertex buffer.
* If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
* If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
*
* destination must contain enough space for the target index buffer (target_vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
* vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
* color_weight determines relative priority of color wrt position; 1.0 is a safe default
*/
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
/**
* Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics
*/
MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
struct meshopt_VertexFetchStatistics
{
unsigned int bytes_fetched;
float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
};
/**
* Vertex fetch cache analyzer
* Returns cache hit statistics using a simplified direct mapped model
* Results may not match actual GPU performance
*/
MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
struct meshopt_OverdrawStatistics
{
unsigned int pixels_covered;
@@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics
*/
MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
struct meshopt_VertexFetchStatistics
struct meshopt_CoverageStatistics
{
unsigned int bytes_fetched;
float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
float coverage[3];
float extent; /* viewport size in mesh coordinates */
};
/**
* Vertex fetch cache analyzer
* Returns cache hit statistics using a simplified direct mapped model
* Results may not match actual GPU performance
* Coverage analyzer
* Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
*
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Meshlet is a small mesh cluster (subset) that consists of:
* - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use;
* - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
*
* For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
*/
struct meshopt_Meshlet
{
/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
unsigned int vertex_offset;
unsigned int triangle_offset;
/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
/* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
unsigned int vertex_count;
unsigned int triangle_count;
};
@@ -484,14 +654,15 @@ struct meshopt_Meshlet
* Meshlet builder
* Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
* The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
* When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
* When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
* When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
*
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
* meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
* meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
* meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
* meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
* max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
*/
MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
/**
* Experimental: Meshlet optimizer
* Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
* Meshlet builder with flexible cluster sizes
* Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
* Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
*
* meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
* need to be computed from meshlet's vertex_offset and triangle_offset
* triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
* meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
* meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
* split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
*/
MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
/**
* Meshlet builder that produces clusters optimized for raytracing
* Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet.
*
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
* meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
* meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
* fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
*/
MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
/**
* Meshlet optimizer
* Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
*
* meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
* triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
*/
MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
struct meshopt_Bounds
{
@@ -544,11 +742,35 @@ struct meshopt_Bounds
* Real-Time Rendering 4th Edition, section 19.3).
*
* vertex_positions should have float3 position in the first 12 bytes of each vertex
* index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
* vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
* index_count/3 and triangle_count must not exceed implementation limits (<= 512)
*/
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Sphere bounds generator
* Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
*
* positions should have float3 position in the first 12 bytes of each element
* radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
*/
MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
/**
* Cluster partitioner
* Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
* When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
*
* destination must contain enough space for the resulting partition data (cluster_count elements)
* destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
* cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
* cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
* vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
* target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
*/
MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
/**
* Spatial sorter
* Generates a remap table that can be used to reorder points for spatial locality.
@@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig
MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Experimental: Spatial sorter
* Spatial sorter
* Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
*
* destination must contain enough space for the resulting index buffer (index_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Spatial clusterizer
* Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
* Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size.
*
* destination must contain enough space for the resulting index buffer (vertex_count elements)
* vertex_positions should have float3 position in the first 12 bytes of each vertex
*/
MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
/**
* Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
* Representable magnitude range: [6e-5; 65504]
* Maximum relative reconstruction error: 5e-4
*/
MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
/**
* Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
* Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
* Assumes N is in a valid mantissa precision range, which is 1..23
*/
MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
/**
* Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
* Preserves Inf/NaN, flushes denormals to zero
*/
MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
/**
* Set allocation callbacks
@@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
* Note that all algorithms only allocate memory for temporary use.
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
*/
MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
#ifdef __cplusplus
} /* extern "C" */
#endif
/* Quantization into commonly supported data formats */
/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
#ifdef __cplusplus
/**
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
@@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N);
* Maximum reconstruction error: 1/2^N
*/
inline int meshopt_quantizeSnorm(float v, int N);
/**
* Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
* Representable magnitude range: [6e-5; 65504]
* Maximum relative reconstruction error: 5e-4
*/
MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
/**
* Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
* Assumes N is in a valid mantissa precision range, which is 1..23
*/
MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
/**
* Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
* Preserves Inf/NaN, flushes denormals to zero
*/
MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
#endif
/**
@@ -631,6 +863,10 @@ template <typename T>
inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
template <typename T>
inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
template <typename F>
inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
template <typename T, typename F>
inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
template <typename T>
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
template <typename T>
@@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
template <typename T>
inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
template <typename T>
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
template <typename T>
inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -661,29 +899,44 @@ template <typename T>
inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
template <typename T>
inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
template <typename T>
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
template <typename T>
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
template <typename T>
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
template <typename T>
inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
template <typename T>
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
template <typename T>
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
template <typename T>
inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
template <typename T>
inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
template <typename T>
inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
template <typename T>
inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
template <typename T>
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
template <typename T>
inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
template <typename T>
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
#endif
@@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N)
class meshopt_Allocator
{
public:
template <typename T>
struct StorageT
struct Storage
{
static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
};
typedef StorageT<void> Storage;
#ifdef MESHOPTIMIZER_ALLOC_EXPORT
MESHOPTIMIZER_API static Storage& storage();
#else
static Storage& storage()
{
static Storage s = {::operator new, ::operator delete };
return s;
}
#endif
meshopt_Allocator()
: blocks()
, count(0)
: blocks()
, count(0)
{
}
~meshopt_Allocator()
{
for (size_t i = count; i > 0; --i)
Storage::deallocate(blocks[i - 1]);
storage().deallocate(blocks[i - 1]);
}
template <typename T> T* allocate(size_t size)
template <typename T>
T* allocate(size_t size)
{
assert(count < sizeof(blocks) / sizeof(blocks[0]));
T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
blocks[count++] = result;
return result;
}
@@ -749,7 +1010,7 @@ public:
void deallocate(void* ptr)
{
assert(count > 0 && blocks[count - 1] == ptr);
Storage::deallocate(ptr);
storage().deallocate(ptr);
count--;
}
@@ -757,10 +1018,6 @@ private:
void* blocks[24];
size_t count;
};
// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
#endif
/* Inline implementation for C++ templated wrappers */
@@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter<T, false>
{
size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));
if (input)
{
@@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter<T, false>
result[i] = T(data[i]);
}
meshopt_Allocator::Storage::deallocate(data);
meshopt_Allocator::storage().deallocate(data);
}
};
@@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const
return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
}
template <typename F>
inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
{
struct Call
{
static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; }
};
return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
}
template <typename T, typename F>
inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
{
struct Call
{
static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; }
};
meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);
return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback);
}
template <typename T>
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
{
@@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
template <typename T>
inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
size_t bound = vertex_count + (index_count / 3);
assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T
(void)bound;
return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
}
template <typename T>
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
{
@@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
}
inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
{
return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1);
}
template <typename T>
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
{
@@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
}
template <typename T>
inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
{
meshopt_IndexAdapter<T> inout(indices, indices, index_count);
return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
}
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
}
template <typename T>
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
}
template <typename T>
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
}
template <typename T>
@@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
}
template <typename T>
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
}
template <typename T>
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
}
template <typename T>
@@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
}
template <typename T>
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
template <typename T>
@@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
}
template <typename T>
inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
}
template <typename T>
inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
{
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
}
template <typename T>
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
template <typename T>
inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
{
meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count);
return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size);
}
template <typename T>
inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
@@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
#endif
/**
* Copyright (c) 2016-2024 Arseny Kapoulkine
* Copyright (c) 2016-2025 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation

View File

@@ -10,24 +10,24 @@
namespace meshopt
{
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float mesh_centroid[3] = {};
for (size_t i = 0; i < index_count; ++i)
for (size_t i = 0; i < vertex_count; ++i)
{
const float* p = vertex_positions + vertex_stride_float * indices[i];
const float* p = vertex_positions + vertex_stride_float * i;
mesh_centroid[0] += p[0];
mesh_centroid[1] += p[1];
mesh_centroid[2] += p[2];
}
mesh_centroid[0] /= index_count;
mesh_centroid[1] /= index_count;
mesh_centroid[2] /= index_count;
mesh_centroid[0] /= float(vertex_count);
mesh_centroid[1] /= float(vertex_count);
mesh_centroid[2] /= float(vertex_count);
for (size_t cluster = 0; cluster < cluster_count; ++cluster)
{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
// fill sort data
float* sort_data = allocator.allocate<float>(cluster_count);
calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
// sort clusters using sort data
unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);

View File

@@ -0,0 +1,624 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <math.h>
#include <string.h>
// This work is based on:
// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
namespace meshopt
{
// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
const int kMergeDepthCutoff = 40;
struct ClusterAdjacency
{
unsigned int* offsets;
unsigned int* clusters;
unsigned int* shared;
};
static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
{
(void)vertex_count;
(void)total_index_count;
size_t cluster_start = 0;
size_t cluster_write = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
offsets[i] = unsigned(cluster_write);
// copy cluster indices, skipping duplicates
for (size_t j = 0; j < cluster_index_counts[i]; ++j)
{
unsigned int v = cluster_indices[cluster_start + j];
assert(v < vertex_count);
data[cluster_write] = v;
cluster_write += 1 - used[v];
used[v] = 1;
}
// reset used flags for the next cluster
for (size_t j = offsets[i]; j < cluster_write; ++j)
used[data[j]] = 0;
cluster_start += cluster_index_counts[i];
}
assert(cluster_start == total_index_count);
assert(cluster_write <= total_index_count);
offsets[cluster_count] = unsigned(cluster_write);
}
static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
float center[3] = {0, 0, 0};
// approximate center of the cluster by averaging all vertex positions
for (size_t j = 0; j < index_count; ++j)
{
const float* p = vertex_positions + indices[j] * vertex_stride_float;
center[0] += p[0];
center[1] += p[1];
center[2] += p[2];
}
// note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
if (index_count)
{
center[0] /= float(index_count);
center[1] /= float(index_count);
center[2] /= float(index_count);
}
// compute radius of the bounding sphere for each cluster
float radiussq = 0;
for (size_t j = 0; j < index_count; ++j)
{
const float* p = vertex_positions + indices[j] * vertex_stride_float;
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
radiussq = radiussq < d2 ? d2 : radiussq;
}
memcpy(out_center, center, sizeof(center));
return sqrtf(radiussq);
}
static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
{
unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1);
// compute number of clusters referenced by each vertex
memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
for (size_t i = 0; i < cluster_count; ++i)
{
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
ref_offsets[cluster_indices[j]]++;
}
// compute (worst-case) number of adjacent clusters for each cluster
size_t total_adjacency = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
size_t count = 0;
// worst case is every vertex has a disjoint cluster list
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
count += ref_offsets[cluster_indices[j]] - 1;
// ... but only every other cluster can be adjacent in the end
total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
}
// we can now allocate adjacency buffers
adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
// convert ref counts to offsets
size_t total_refs = 0;
for (size_t i = 0; i < vertex_count; ++i)
{
size_t count = ref_offsets[i];
ref_offsets[i] = unsigned(total_refs);
total_refs += count;
}
unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
// fill cluster refs for each vertex
for (size_t i = 0; i < cluster_count; ++i)
{
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i);
}
// after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
ref_offsets[0] = 0;
// fill cluster adjacency for each cluster...
adjacency.offsets[0] = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
unsigned int* shd = adjacency.shared + adjacency.offsets[i];
size_t count = 0;
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
{
unsigned int v = cluster_indices[j];
// merge the entire cluster list of each vertex into current list
for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
{
unsigned int c = ref_data[k];
assert(c < cluster_count);
if (c == unsigned(i))
continue;
// if the cluster is already in the list, increment the shared count
bool found = false;
for (size_t l = 0; l < count; ++l)
if (adj[l] == c)
{
found = true;
shd[l]++;
break;
}
// .. or append a new cluster
if (!found)
{
adj[count] = c;
shd[count] = 1;
count++;
}
}
}
// mark the end of the adjacency list; the next cluster will start there as well
adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
}
assert(adjacency.offsets[cluster_count] <= total_adjacency);
// ref_offsets can't be deallocated as it was allocated before adjacency
allocator.deallocate(ref_data);
}
struct ClusterGroup
{
int group;
int next;
unsigned int size; // 0 unless root
unsigned int vertices;
float center[3];
float radius;
};
struct GroupOrder
{
unsigned int id;
int order;
};
static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
{
// insert a new element at the end (breaks heap invariant)
heap[size++] = item;
// bubble up the new element to its correct position
size_t i = size - 1;
while (i > 0 && heap[i].order < heap[(i - 1) / 2].order)
{
size_t p = (i - 1) / 2;
GroupOrder temp = heap[i];
heap[i] = heap[p];
heap[p] = temp;
i = p;
}
}
static GroupOrder heapPop(GroupOrder* heap, size_t size)
{
assert(size > 0);
GroupOrder top = heap[0];
// move the last element to the top (breaks heap invariant)
heap[0] = heap[--size];
// bubble down the new top element to its correct position
size_t i = 0;
while (i * 2 + 1 < size)
{
// find the smallest child
size_t j = i * 2 + 1;
j += (j + 1 < size && heap[j + 1].order < heap[j].order);
// if the parent is already smaller than both children, we're done
if (heap[j].order >= heap[i].order)
break;
// otherwise, swap the parent and child and continue
GroupOrder temp = heap[i];
heap[i] = heap[j];
heap[j] = temp;
i = j;
}
return top;
}
static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
{
unsigned int total = 0;
for (int i1 = group1; i1 >= 0; i1 = groups[i1].next)
for (int i2 = group2; i2 >= 0; i2 = groups[i2].next)
{
for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj)
if (adjacency.clusters[adj] == unsigned(i2))
{
total += adjacency.shared[adj];
break;
}
}
return total;
}
static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
{
float r1 = target.radius, r2 = source.radius;
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
if (d + r1 < r2)
{
target.center[0] = source.center[0];
target.center[1] = source.center[1];
target.center[2] = source.center[2];
target.radius = source.radius;
return;
}
if (d + r2 > r1)
{
float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
target.center[0] += dx * k;
target.center[1] += dy * k;
target.center[2] += dz * k;
target.radius = (d + r2 + r1) / 2;
}
}
static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
{
float r1 = target.radius, r2 = source.radius;
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
float d = sqrtf(dx * dx + dy * dy + dz * dz);
float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
return mr > 0 ? r1 / mr : 0.f;
}
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
{
assert(groups[id].size > 0);
float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
int best_group = -1;
float best_score = 0;
for (int ci = id; ci >= 0; ci = groups[ci].next)
{
for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
{
int other = groups[adjacency.clusters[adj]].group;
if (other < 0)
continue;
assert(groups[other].size > 0);
if (groups[id].size + groups[other].size > max_partition_size)
continue;
unsigned int shared = countShared(groups, id, other, adjacency);
float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
// normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
// incorporate spatial score to favor merging nearby groups
if (use_bounds)
score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
if (score > best_score)
{
best_group = other;
best_score = score;
}
}
}
return best_group;
}
static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
{
for (size_t i = 0; i < count; ++i)
{
unsigned int id = order[i];
if (groups[id].size == 0 || groups[id].size >= target_partition_size)
continue;
float best_score = -1.f;
int best_group = -1;
for (size_t j = 0; j < count; ++j)
{
unsigned int other = order[j];
if (id == other || groups[other].size == 0)
continue;
if (groups[id].size + groups[other].size > max_partition_size)
continue;
// favor merging nearby groups
float score = boundsScore(groups[id], groups[other]);
if (score > best_score)
{
best_score = score;
best_group = other;
}
}
// merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
if (best_group != -1)
{
// combine groups by linking them together
unsigned int tail = best_group;
while (groups[tail].next >= 0)
tail = groups[tail].next;
groups[tail].next = id;
// update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
groups[best_group].size += groups[id].size;
groups[id].size = 0;
// merge bounding spheres
mergeBounds(groups[best_group], groups[id]);
groups[id].radius = 0.f;
}
}
}
static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
{
size_t m = 0;
// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
for (size_t i = 0; i < count; ++i)
{
float v = groups[order[i]].center[axis];
// swap(m, i) unconditionally
unsigned int t = order[m];
order[m] = order[i];
order[i] = t;
// when v >= pivot, we swap i with m without advancing it, preserving invariants
m += v < pivot;
}
return m;
}
static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
{
size_t total = 0;
for (size_t i = 0; i < count; ++i)
total += groups[order[i]].size;
if (total <= max_partition_size || count <= leaf_size)
return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
float mean[3] = {};
float vars[3] = {};
float runc = 1, runs = 1;
// gather statistics on the points in the subtree using Welford's algorithm
for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
{
const float* point = groups[order[i]].center;
for (int k = 0; k < 3; ++k)
{
float delta = point[k] - mean[k];
mean[k] += delta * runs;
vars[k] += delta * (point[k] - mean[k]);
}
}
// split axis is one where the variance is largest
int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
float split = mean[axis];
size_t middle = mergePartition(order, count, groups, axis, split);
// enforce balance for degenerate partitions
// this also ensures recursion depth is bounded on pathological inputs
if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
middle = count / 2;
// recursion depth is logarithmic and bounded due to max depth check above
mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
}
} // namespace meshopt
size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
{
using namespace meshopt;
assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_partition_size > 0);
size_t max_partition_size = target_partition_size + target_partition_size / 3;
meshopt_Allocator allocator;
unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
memset(used, 0, vertex_count);
unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
// make new cluster index list that filters out duplicate indices
filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
cluster_indices = cluster_newindices;
// build cluster adjacency along with edge weights (shared vertex count)
ClusterAdjacency adjacency = {};
buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
size_t pending = 0;
// create a singleton group for each cluster and order them by priority
for (size_t i = 0; i < cluster_count; ++i)
{
groups[i].group = int(i);
groups[i].next = -1;
groups[i].size = 1;
groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
assert(groups[i].vertices > 0);
// compute bounding sphere for each cluster if positions are provided
if (vertex_positions)
groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
GroupOrder item = {};
item.id = unsigned(i);
item.order = groups[i].vertices;
heapPush(order, pending++, item);
}
// iteratively merge the smallest group with the best group
while (pending)
{
GroupOrder top = heapPop(order, pending--);
// this group was merged into another group earlier
if (groups[top.id].size == 0)
continue;
// disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
for (int i = top.id; i >= 0; i = groups[i].next)
{
assert(groups[i].group == int(top.id));
groups[i].group = -1;
}
// the group is large enough, emit as is
if (groups[top.id].size >= target_partition_size)
continue;
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
// we can't grow the group any more, emit as is
if (best_group == -1)
continue;
// compute shared vertices to adjust the total vertices estimate after merging
unsigned int shared = countShared(groups, top.id, best_group, adjacency);
// combine groups by linking them together
unsigned int tail = top.id;
while (groups[tail].next >= 0)
tail = groups[tail].next;
groups[tail].next = best_group;
// update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
groups[top.id].size += groups[best_group].size;
groups[top.id].vertices += groups[best_group].vertices;
groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
groups[best_group].size = 0;
groups[best_group].vertices = 0;
// merge bounding spheres if bounds are available
if (vertex_positions)
{
mergeBounds(groups[top.id], groups[best_group]);
groups[best_group].radius = 0;
}
// re-associate all clusters back to the merged group
for (int i = top.id; i >= 0; i = groups[i].next)
groups[i].group = int(top.id);
top.order = groups[top.id].vertices;
heapPush(order, pending++, top);
}
// if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
if (vertex_positions)
{
unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
size_t merge_offset = 0;
for (size_t i = 0; i < cluster_count; ++i)
if (groups[i].size)
merge_order[merge_offset++] = unsigned(i);
mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
}
// output each remaining group
size_t next_group = 0;
for (size_t i = 0; i < cluster_count; ++i)
{
if (groups[i].size == 0)
continue;
for (int j = int(i); j >= 0; j = groups[j].next)
destination[j] = unsigned(next_group);
next_group++;
}
assert(next_group <= cluster_count);
return next_group;
}

View File

@@ -18,14 +18,6 @@ struct OverdrawBuffer
unsigned int overdraw[kViewport][kViewport][2];
};
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
@@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1,
float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
float invdet = (det == 0) ? 0 : 1 / det;
dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
return det;
}
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
// bounding rectangle, clipped against viewport
// since we rasterize pixels with covered centers, min >0.5 should round up
// as for max, due to top-left filling convention we will never rasterize right/bottom edges
// so max >= 0.5 should round down
int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
// so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
int minx = X1 < X2 ? X1 : X2;
minx = minx < X3 ? minx : X3;
minx = (minx + 7) >> 4;
minx = minx < 0 ? 0 : minx;
int miny = Y1 < Y2 ? Y1 : Y2;
miny = miny < Y3 ? miny : Y3;
miny = (miny + 7) >> 4;
miny = miny < 0 ? 0 : miny;
int maxx = X1 > X2 ? X1 : X2;
maxx = maxx > X3 ? maxx : X3;
maxx = (maxx + 7) >> 4;
maxx = maxx > kViewport ? kViewport : maxx;
int maxy = Y1 > Y2 ? Y1 : Y2;
maxy = maxy > Y3 ? maxy : Y3;
maxy = (maxy + 7) >> 4;
maxy = maxy > kViewport ? kViewport : maxy;
// deltas, 28.4 fixed point
int DX12 = X1 - X2;
@@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
}
}
} // namespace meshopt
meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
meshopt_OverdrawStatistics result = {};
float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
@@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
for (int j = 0; j < 3; ++j)
{
minv[j] = min(minv[j], v[j]);
maxv[j] = max(maxv[j], v[j]);
float vj = v[j];
minv[j] = minv[j] > vj ? vj : minv[j];
maxv[j] = maxv[j] < vj ? vj : maxv[j];
}
}
float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
float scale = kViewport / extent;
float extent = 0.f;
float* triangles = allocator.allocate<float>(index_count * 3);
extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
float scale = kViewport / extent;
for (size_t i = 0; i < index_count; ++i)
{
@@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
}
return extent;
}
static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
{
for (size_t i = 0; i < index_count; i += 3)
{
const float* vn0 = &triangles[3 * (i + 0)];
const float* vn1 = &triangles[3 * (i + 1)];
const float* vn2 = &triangles[3 * (i + 2)];
switch (axis)
{
case 0:
rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
break;
case 1:
rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
break;
case 2:
rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
break;
}
}
}
} // namespace meshopt
meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
meshopt_OverdrawStatistics result = {};
float* triangles = allocator.allocate<float>(index_count * 3);
transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
for (size_t i = 0; i < index_count; i += 3)
{
const float* vn0 = &triangles[3 * (i + 0)];
const float* vn1 = &triangles[3 * (i + 1)];
const float* vn2 = &triangles[3 * (i + 2)];
switch (axis)
{
case 0:
rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
break;
case 1:
rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
break;
case 2:
rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
break;
}
}
rasterizeTriangles(buffer, triangles, index_count, axis);
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
@@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
return result;
}
meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
meshopt_CoverageStatistics result = {};
float* triangles = allocator.allocate<float>(index_count * 3);
float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
for (int axis = 0; axis < 3; ++axis)
{
memset(buffer, 0, sizeof(OverdrawBuffer));
rasterizeTriangles(buffer, triangles, index_count, axis);
unsigned int covered = 0;
for (int y = 0; y < kViewport; ++y)
for (int x = 0; x < kViewport; ++x)
covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0;
result.coverage[axis] = float(covered) / float(kViewport * kViewport);
}
result.extent = extent;
return result;
}

File diff suppressed because it is too large Load Diff

View File

@@ -10,18 +10,19 @@
namespace meshopt
{
// "Insert" two 0 bits after each of the 10 low bits of x
inline unsigned int part1By2(unsigned int x)
// "Insert" two 0 bits after each of the 20 low bits of x
inline unsigned long long part1By2(unsigned long long x)
{
x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
}
static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
{
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
@@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat
extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
float scale = extent == 0 ? 0.f : 1.f / extent;
// rescale each axis to 16 bits to get 48-bit Morton codes
float scale = extent == 0 ? 0.f : 65535.f / extent;
// generate Morton order based on the position inside a unit cube
for (size_t i = 0; i < vertex_count; ++i)
{
const float* v = vertex_positions_data + i * vertex_stride_float;
int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
int x = int((v[0] - minv[0]) * scale + 0.5f);
int y = int((v[1] - minv[1]) * scale + 0.5f);
int z = int((v[2] - minv[2]) * scale + 0.5f);
result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
if (morton)
result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
else
result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
}
}
static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
{
unsigned int hist[1024];
memset(hist, 0, sizeof(hist));
// compute 3 10-bit histograms in parallel
// compute histogram (assume keys are 10-bit)
for (size_t i = 0; i < count; ++i)
{
unsigned int id = data[i];
hist[keys[i]]++;
hist[(id >> 0) & 1023][0]++;
hist[(id >> 10) & 1023][1]++;
hist[(id >> 20) & 1023][2]++;
}
unsigned int sumx = 0, sumy = 0, sumz = 0;
unsigned int sum = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 1024; ++i)
{
unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
hist[i][0] = sumx;
hist[i][1] = sumy;
hist[i][2] = sumz;
sumx += hx;
sumy += hy;
sumz += hz;
unsigned int h = hist[i];
hist[i] = sum;
sum += h;
}
assert(sumx == count && sumy == count && sumz == count);
assert(sum == count);
// reorder values
for (size_t i = 0; i < count; ++i)
{
unsigned int id = keys[source[i]];
destination[hist[id]++] = source[i];
}
}
static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
{
int bitoff = pass * 10;
memset(hist, 0, sizeof(hist));
// compute 2 8-bit histograms in parallel
for (size_t i = 0; i < count; ++i)
{
unsigned long long id = data[i];
hist[(id >> 0) & 255][0]++;
hist[(id >> 8) & 255][1]++;
}
unsigned int sum0 = 0, sum1 = 0;
// replace histogram data with prefix histogram sums in-place
for (int i = 0; i < 256; ++i)
{
unsigned int h0 = hist[i][0], h1 = hist[i][1];
hist[i][0] = sum0;
hist[i][1] = sum1;
sum0 += h0;
sum1 += h1;
}
assert(sum0 == count && sum1 == count);
}
static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
{
int bitoff = pass * 8;
for (size_t i = 0; i < count; ++i)
{
unsigned int id = (keys[source[i]] >> bitoff) & 1023;
unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255;
destination[hist[id][pass]++] = source[i];
}
}
static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
{
size_t l = 0, r = split;
for (size_t i = 0; i < count; ++i)
{
unsigned char side = sides[order[i]];
target[side ? r : l] = order[i];
l += 1;
l -= side;
r += side;
}
assert(l == split && r == count);
}
static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
{
if (count <= cluster_size)
{
memcpy(destination, orderx, count * sizeof(unsigned int));
return;
}
unsigned int* axes[3] = {orderx, ordery, orderz};
int bestk = -1;
unsigned int bestdim = 0;
for (int k = 0; k < 3; ++k)
{
const unsigned int mask = (1 << 20) - 1;
unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask);
if (dim >= bestdim)
{
bestk = k;
bestdim = dim;
}
}
assert(bestk >= 0);
// split roughly in half, with the left split always being aligned to cluster size
size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
assert(split > 0 && split < count);
// mark sides of split for partitioning
unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
for (size_t i = 0; i < split; ++i)
sides[axes[bestk][i]] = 0;
for (size_t i = split; i < count; ++i)
sides[axes[bestk][i]] = 1;
// partition all axes into two sides, maintaining order
unsigned int* temp = static_cast<unsigned int*>(scratch);
for (int k = 0; k < 3; ++k)
{
if (k == bestk)
continue;
unsigned int* axis = axes[k];
memcpy(temp, axis, sizeof(unsigned int) * count);
partitionPoints(axis, temp, sides, split, count);
}
// recursion depth is logarithmic and bounded as we always split in approximately half
splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
}
} // namespace meshopt
void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
meshopt_Allocator allocator;
unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
unsigned int hist[1024][3];
computeHistogram(hist, keys, vertex_count);
unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
destination[i] = unsigned(i);
// 3-pass radix sort computes the resulting order into scratch
radixPass(scratch, destination, keys, vertex_count, hist, 0);
radixPass(destination, scratch, keys, vertex_count, hist, 1);
radixPass(scratch, destination, keys, vertex_count, hist, 2);
unsigned int* order[] = {scratch, destination};
// 5-pass radix sort computes the resulting order into scratch
for (int k = 0; k < 5; ++k)
{
// copy 10-bit key segments into keyk to reduce cache pressure during radix pass
for (size_t i = 0; i < vertex_count; ++i)
keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
}
// since our remap table is mapping old=>new, we need to reverse it
for (size_t i = 0; i < vertex_count; ++i)
@@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
destination[r * 3 + 2] = c;
}
}
void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
{
using namespace meshopt;
assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(cluster_size > 0);
meshopt_Allocator allocator;
unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false);
unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3);
unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count);
for (int k = 0; k < 3; ++k)
{
// copy 16-bit key segments into keyk to reduce cache pressure during radix pass
for (size_t i = 0; i < vertex_count; ++i)
keyk[i] = (unsigned short)(keys[i] >> (k * 20));
unsigned int hist[256][2];
computeHistogram(hist, keyk, vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
order[k * vertex_count + i] = unsigned(i);
radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0);
radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1);
}
splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
}

View File

@@ -10,14 +10,14 @@
namespace meshopt
{
static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
{
unsigned int index = 0;
unsigned int iv = ~0u;
for (size_t i = 0; i < buffer_size; ++i)
{
unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
if (v < iv)
@@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
size_t strip_size = 0;
// compute vertex valence; this is used to prioritize starting triangle for strips
unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
memset(valence, 0, vertex_count * sizeof(unsigned int));
// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
memset(valence, 0, vertex_count);
for (size_t i = 0; i < index_count; ++i)
{
@@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
{
// if we didn't find anything, we need to find the next new triangle
// we use a heuristic to maximize the strip length
unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
unsigned int i = findStripFirst(buffer, buffer_size, valence);
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
// ordered removal from the buffer

File diff suppressed because it is too large Load Diff

View File

@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)
static void decodeFilterQuat(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; ++i)
{
// recover scale from the high byte of the component
int sf = data[i * 4 + 3] | 3;
float ss = scale / float(sf);
float s = float(sf);
// convert x/y/z to [-1..1] (scaled...)
float x = float(data[i * 4 + 0]) * ss;
float y = float(data[i * 4 + 1]) * ss;
float z = float(data[i * 4 + 2]) * ss;
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
float x = float(data[i * 4 + 0]);
float y = float(data[i * 4 + 1]);
float z = float(data[i * 4 + 2]);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
float ww = 1.f - x * x - y * y - z * z;
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
float ws = s * s;
float ww = ws * 2.f - x * x - y * y - z * z;
float w = sqrtf(ww >= 0.f ? ww : 0.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
float ss = scale / s;
// rounded signed float->int
int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
int wf = int(w * 32767.f + 0.5f);
int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
int wf = int(w * ss + 0.5f);
int qc = data[i * 4 + 3] & 3;
@@ -165,6 +170,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
data[i] = u.ui;
}
}
template <typename ST, typename T>
static void decodeFilterColor(T* data, size_t count)
{
const float max = float((1 << (sizeof(T) * 8)) - 1);
for (size_t i = 0; i < count; ++i)
{
// recover scale from alpha high bit
int as = data[i * 4 + 3];
as |= as >> 1;
as |= as >> 2;
as |= as >> 4;
as |= as >> 8; // noop for 8-bit
// convert to RGB in fixed point (co/cg are sign extended)
int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
int r = y + co - cg;
int g = y + cg;
int b = y - co - cg;
// expand alpha by one bit to match other components
int a = data[i * 4 + 3];
a = ((a << 1) & as) | (a & 1);
// compute scaling factor
float ss = max / float(as);
// rounded float->int
int rf = int(float(r) * ss + 0.5f);
int gf = int(float(g) * ss + 0.5f);
int bf = int(float(b) * ss + 0.5f);
int af = int(float(a) * ss + 0.5f);
data[i * 4 + 0] = T(rf);
data[i * 4 + 1] = T(gf);
data[i * 4 + 2] = T(bf);
data[i * 4 + 3] = T(af);
}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -201,7 +247,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
#endif
#ifdef SIMD_SSE
static void decodeFilterOctSimd(signed char* data, size_t count)
static void decodeFilterOctSimd8(signed char* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
@@ -246,7 +292,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
}
}
static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterOctSimd16(short* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
@@ -295,8 +341,9 @@ static void decodeFilterOctSimd(short* data, size_t count)
__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
// patch in .w
res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
__m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0);
res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw));
res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
@@ -305,7 +352,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -324,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
__m128 s = _mm_cvtepi32_ps(sf);
// convert x/y/z to [-1..1] (scaled...)
__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
__m128 x = _mm_cvtepi32_ps(xf);
__m128 y = _mm_cvtepi32_ps(yf);
__m128 z = _mm_cvtepi32_ps(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
__m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
__m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
__m128 s = _mm_set1_ps(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);
// rounded signed float->int
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));
// mix x/z and w/y to make 16-bit unpack easier
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -385,6 +435,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
}
}
static void decodeFilterColorSimd8(unsigned char* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
__m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
__m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
__m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
__m128i af = _mm_srli_epi32(c4, 24);
// recover scale from alpha high bit
__m128i as = af;
as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
// expand alpha by one bit to match other components
af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
// compute scaling factor
__m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
// convert to RGB in fixed point
__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
__m128i gf = _mm_add_epi32(yf, cgf);
__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
// rounded signed float->int
__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
// repack rgba into final value
__m128i res = rr;
res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
}
}
static void decodeFilterColorSimd16(unsigned short* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
__m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
__m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
// gather both y/co 16-bit pairs in each 32-bit lane
__m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
__m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
__m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
__m128i cof = _mm_srai_epi32(c4_yco, 16);
__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
__m128i af = _mm_srli_epi32(c4_cga, 16);
// recover scale from alpha high bit
__m128i as = af;
as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
// expand alpha by one bit to match other components
af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
// compute scaling factor
__m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
// convert to RGB in fixed point
__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
__m128i gf = _mm_add_epi32(yf, cgf);
__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
// rounded signed float->int
__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
// mix r/b and g/a to make 16-bit unpack easier
__m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
__m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
// pack r/g/b/a using 16-bit unpacks
__m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
__m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
}
}
#endif
#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
@@ -401,10 +550,17 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
return vmulq_f32(x, r);
}
#ifndef __ARM_FEATURE_FMA
inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
{
return vaddq_f32(x, vmulq_f32(y, z));
}
#endif
#endif
#ifdef SIMD_NEON
static void decodeFilterOctSimd(signed char* data, size_t count)
static void decodeFilterOctSimd8(signed char* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
@@ -431,29 +587,27 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
float32x4_t rl = vrsqrteq_f32(ll);
float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
// combine xr/yr/zr into final value
int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
}
static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterOctSimd16(short* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
@@ -485,21 +639,25 @@ static void decodeFilterOctSimd(short* data, size_t count)
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
#if !defined(__aarch64__) && !defined(_M_ARM64)
float32x4_t rl = vrsqrteq_f32(ll);
rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
#else
float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
#endif
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
// mix x/z and y/0 to make 16-bit unpack easier
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -517,7 +675,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -536,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
float32x4_t s = vcvtq_f32_s32(sf);
// convert x/y/z to [-1..1] (scaled...)
float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
float32x4_t x = vcvtq_f32_s32(xf);
float32x4_t y = vcvtq_f32_s32(yf);
float32x4_t z = vcvtq_f32_s32(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ws = vmulq_f32(s, s);
float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
float32x4_t s = vdupq_n_f32(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
// mix x/z and w/y to make 16-bit unpack easier
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
// store results to stack so that we can rotate using scalar instructions
// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
volatile uint64_t res[4];
vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
// rotate and store
uint64_t* out = (uint64_t*)&data[i * 4];
uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
}
}
@@ -595,10 +762,112 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
}
}
static void decodeFilterColorSimd8(unsigned char* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
// recover scale from alpha high bit
int32x4_t as = af;
as = vorrq_s32(as, vshrq_n_s32(as, 1));
as = vorrq_s32(as, vshrq_n_s32(as, 2));
as = vorrq_s32(as, vshrq_n_s32(as, 4));
// expand alpha by one bit to match other components
af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
// compute scaling factor
float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
// convert to RGB in fixed point
int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
int32x4_t gf = vaddq_s32(yf, cgf);
int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
// repack rgba into final value
int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
}
static void decodeFilterColorSimd16(unsigned short* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
// gather both y/co 16-bit pairs in each 32-bit lane
int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0];
int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1];
// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff));
int32x4_t cof = vshrq_n_s32(c4_yco, 16);
int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16);
int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16));
// recover scale from alpha high bit
int32x4_t as = af;
as = vorrq_s32(as, vshrq_n_s32(as, 1));
as = vorrq_s32(as, vshrq_n_s32(as, 2));
as = vorrq_s32(as, vshrq_n_s32(as, 4));
as = vorrq_s32(as, vshrq_n_s32(as, 8));
// expand alpha by one bit to match other components
af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
// compute scaling factor
float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as));
// convert to RGB in fixed point
int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
int32x4_t gf = vaddq_s32(yf, cgf);
int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
// mix r/b and g/a to make 16-bit unpack easier
int32x4_t rbr = vsliq_n_s32(rr, br, 16);
int32x4_t gar = vsliq_n_s32(gr, ar, 16);
// pack r/g/b/a using 16-bit unpacks
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]);
vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
}
}
#endif
#ifdef SIMD_WASM
static void decodeFilterOctSimd(signed char* data, size_t count)
static void decodeFilterOctSimd8(signed char* data, size_t count)
{
const v128_t sign = wasm_f32x4_splat(-0.f);
@@ -647,10 +916,11 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
}
}
static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterOctSimd16(short* data, size_t count)
{
const v128_t sign = wasm_f32x4_splat(-0.f);
const v128_t zmask = wasm_i32x4_splat(0x7fff);
// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
volatile v128_t zmask = wasm_i32x4_splat(0x7fff);
for (size_t i = 0; i < count; i += 4)
{
@@ -711,7 +981,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
@@ -730,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
v128_t s = wasm_f32x4_convert_i32x4(sf);
// convert x/y/z to [-1..1] (scaled...)
v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
v128_t x = wasm_f32x4_convert_i32x4(xf);
v128_t y = wasm_f32x4_convert_i32x4(yf);
v128_t z = wasm_f32x4_convert_i32x4(zf);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
// note: i32x4_max with 0 is equivalent to f32x4_max
v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t ws = wasm_f32x4_mul(s, s);
v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
v128_t s = wasm_f32x4_splat(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap);
// mix x/z and w/y to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
@@ -762,8 +1035,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
// compute component index shifted left by 4 (and moved into i32x4 slot)
// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
volatile v128_t cm = wasm_i32x4_shl(cf, 4);
v128_t cm = wasm_i32x4_shl(cf, 4);
// rotate and store
uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
@@ -794,6 +1066,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
wasm_v128_store(&data[i], r);
}
}
static void decodeFilterColorSimd8(unsigned char* data, size_t count)
{
// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
volatile v128_t zero = wasm_i32x4_splat(0);
for (size_t i = 0; i < count; i += 4)
{
v128_t c4 = wasm_v128_load(&data[i * 4]);
// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff));
v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24);
v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24);
v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24));
// recover scale from alpha high bit
v128_t as = af;
as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
// expand alpha by one bit to match other components
af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
// compute scaling factor
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as));
// convert to RGB in fixed point
v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
v128_t gf = wasm_i32x4_add(yf, cgf);
v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
// repack rgba into final value
v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff));
res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8));
res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16));
res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24));
wasm_v128_store(&data[i * 4], res);
}
}
static void decodeFilterColorSimd16(unsigned short* data, size_t count)
{
// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
volatile v128_t zero = wasm_i32x4_splat(0);
for (size_t i = 0; i < count; i += 4)
{
v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]);
v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]);
// gather both y/co 16-bit pairs in each 32-bit lane
v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1);
v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1);
// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff));
v128_t cof = wasm_i32x4_shr(c4_yco, 16);
v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16);
v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16));
// recover scale from alpha high bit
v128_t as = af;
as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
as = wasm_v128_or(as, wasm_i32x4_shr(as, 8));
// expand alpha by one bit to match other components
af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
// compute scaling factor
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as));
// convert to RGB in fixed point
v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
v128_t gf = wasm_i32x4_add(yf, cgf);
v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
// mix r/b and g/a to make 16-bit unpack easier
v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
// pack r/g/b/a using 16-bit unpacks
v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
wasm_v128_store(&data[(i + 0) * 4], res_0);
wasm_v128_store(&data[(i + 2) * 4], res_1);
}
}
#endif
// optimized variant of frexp
@@ -807,7 +1190,7 @@ inline int optlog2(float v)
u.f = v;
// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
return v == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
}
// optimized variant of ldexp
@@ -833,9 +1216,9 @@ void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
if (stride == 4)
dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
dispatchSimd(decodeFilterOctSimd8, static_cast<signed char*>(buffer), count, 4);
else
dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
dispatchSimd(decodeFilterOctSimd16, static_cast<short*>(buffer), count, 4);
#else
if (stride == 4)
decodeFilterOct(static_cast<signed char*>(buffer), count);
@@ -871,10 +1254,29 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
#endif
}
void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride)
{
using namespace meshopt;
assert(stride == 4 || stride == 8);
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
if (stride == 4)
dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4);
else
dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4);
#else
if (stride == 4)
decodeFilterColor<signed char>(static_cast<unsigned char*>(buffer), count);
else
decodeFilterColor<short>(static_cast<unsigned short*>(buffer), count);
#endif
}
void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
{
assert(stride == 4 || stride == 8);
assert(bits >= 1 && bits <= 16);
assert(bits >= 2 && bits <= 16);
signed char* d8 = static_cast<signed char*>(destination);
short* d16 = static_cast<short*>(destination);
@@ -1010,6 +1412,20 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
component_exp[j] = (min_exp < e) ? e : min_exp;
}
}
else if (mode == meshopt_EncodeExpClamped)
{
for (size_t j = 0; j < stride_float; ++j)
{
int e = optlog2(v[j]);
component_exp[j] = (0 < e) ? e : 0;
}
}
else
{
// the code below assumes component_exp is initialized outside of the loop
assert(mode == meshopt_EncodeExpSharedComponent);
}
for (size_t j = 0; j < stride_float; ++j)
{
@@ -1020,7 +1436,6 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
// compute renormalized rounded mantissa for each component
int mmask = (1 << 24) - 1;
int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
d[j] = (m & mmask) | (unsigned(exp) << 24);
@@ -1028,6 +1443,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
}
}
void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data)
{
assert(stride == 4 || stride == 8);
assert(bits >= 2 && bits <= 16);
unsigned char* d8 = static_cast<unsigned char*>(destination);
unsigned short* d16 = static_cast<unsigned short*>(destination);
for (size_t i = 0; i < count; ++i)
{
const float* c = &data[i * 4];
int fr = meshopt_quantizeUnorm(c[0], bits);
int fg = meshopt_quantizeUnorm(c[1], bits);
int fb = meshopt_quantizeUnorm(c[2], bits);
// YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers
int fco = (fr - fb) / 2;
int tmp = fb + fco;
int fcg = (fg - tmp) / 2;
int fy = tmp + fcg;
// validate that R/G/B can be reconstructed with K bit integers
assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
// alpha: K-1-bit encoding with high bit set to 1
int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
if (stride == 4)
{
d8[i * 4 + 0] = (unsigned char)(fy);
d8[i * 4 + 1] = (unsigned char)(fco);
d8[i * 4 + 2] = (unsigned char)(fcg);
d8[i * 4 + 3] = (unsigned char)(fa);
}
else
{
d16[i * 4 + 0] = (unsigned short)(fy);
d16[i * 4 + 1] = (unsigned short)(fco);
d16[i * 4 + 2] = (unsigned short)(fcg);
d16[i * 4 + 3] = (unsigned short)(fa);
}
}
}
#undef SIMD_SSE
#undef SIMD_NEON
#undef SIMD_WASM

View File

@@ -1,58 +0,0 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
assert(index_count % 3 == 0);
assert(vertex_size > 0 && vertex_size <= 256);
meshopt_Allocator allocator;
meshopt_VertexFetchStatistics result = {};
unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
memset(vertex_visited, 0, vertex_count);
const size_t kCacheLine = 64;
const size_t kCacheSize = 128 * 1024;
// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
size_t cache[kCacheSize / kCacheLine] = {};
for (size_t i = 0; i < index_count; ++i)
{
unsigned int index = indices[i];
assert(index < vertex_count);
vertex_visited[index] = 1;
size_t start_address = index * vertex_size;
size_t end_address = start_address + vertex_size;
size_t start_tag = start_address / kCacheLine;
size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
assert(start_tag < end_tag);
for (size_t tag = start_tag; tag < end_tag; ++tag)
{
size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
// we store +1 since cache is filled with 0 by default
result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
cache[line] = tag + 1;
}
}
size_t unique_vertex_count = 0;
for (size_t i = 0; i < vertex_count; ++i)
unique_vertex_count += vertex_visited[i];
result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
return result;
}

View File

@@ -703,6 +703,8 @@ namespace Flax.Build.Bindings
else if (nativeType.EndsWith("[]"))
{
parameterMarshalType = $"MarshalUsing(typeof(FlaxEngine.Interop.ArrayMarshaller<,>))";
if (!parameterInfo.IsOut && !parameterInfo.IsRef)
parameterMarshalType += ", In"; // The usage of 'LibraryImportAttribute' does not follow recommendations. It is recommended to use explicit '[In]' and '[Out]' attributes on array parameters.
}
if (!string.IsNullOrEmpty(parameterMarshalType))

View File

@@ -18,6 +18,23 @@ namespace Flax.Deps.Dependencies
get => new[] { TargetPlatform.Windows };
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -30,7 +47,7 @@ namespace Flax.Deps.Dependencies
// Copy files
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
BuildStarted(platform, TargetArchitecture.x64);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.lib"), Path.Combine(depsFolder, "amd_ags_x64.lib"));
Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.dll"), Path.Combine(depsFolder, "amd_ags_x64.dll"));

View File

@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Flax.Build;
namespace Flax.Deps.Dependencies
@@ -39,6 +40,36 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
case TargetPlatform.Linux:
return new[]
{
TargetArchitecture.x64,
//TargetArchitecture.ARM64,
};
case TargetPlatform.Mac:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -91,22 +122,22 @@ namespace Flax.Deps.Dependencies
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
var configuration = "Release";
var binariesWin = new[]
BuildStarted(platform, architecture);
switch (platform)
{
Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
};
case TargetPlatform.Windows:
{
var configuration = "Release";
var binariesWin = new[]
{
Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
};
// Build for Windows
File.Delete(Path.Combine(root, "CMakeCache.txt"));
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
{
// Build for Windows
File.Delete(Path.Combine(root, "CMakeCache.txt"));
var buildDir = Path.Combine(root, "build-" + architecture);
var solutionPath = Path.Combine(buildDir, "Assimp.sln");
SetupDirectory(buildDir, true);
@@ -116,42 +147,40 @@ namespace Flax.Deps.Dependencies
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in binariesWin)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, Path.GetFileName(file)));
break;
}
break;
}
case TargetPlatform.Linux:
{
var envVars = new Dictionary<string, string>
case TargetPlatform.Linux:
{
{ "CC", "clang-" + Configuration.LinuxClangMinVer },
{ "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
{ "CXX", "clang++-" + Configuration.LinuxClangMinVer },
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var envVars = new Dictionary<string, string>
{
{ "CC", "clang-" + Configuration.LinuxClangMinVer },
{ "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
{ "CXX", "clang-" + Configuration.LinuxClangMinVer },
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
// Build for Linux
RunCmake(root, platform, TargetArchitecture.x64, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
break;
}
case TargetPlatform.Mac:
{
// Build for Mac
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
// Build for Linux
File.Delete(Path.Combine(root, "CMakeCache.txt"));
RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
break;
}
case TargetPlatform.Mac:
{
// Build for Mac
File.Delete(Path.Combine(root, "CMakeCache.txt"));
RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig);
Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
Utilities.Run("make", "clean", null, root, Utilities.RunOptions.DefaultTool);
break;
}
}
break;
}
}
}

View File

@@ -28,6 +28,24 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -46,12 +64,12 @@ namespace Flax.Deps.Dependencies
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
@@ -61,7 +79,7 @@ namespace Flax.Deps.Dependencies
}
}
break;
}
}
}
}

View File

@@ -1,6 +1,5 @@
// Copyright (c) Wojciech Figat. All rights reserved.
using System;
using System.IO;
using System.Linq;
using Flax.Build;
@@ -31,22 +30,40 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
var sdk = WindowsPlatformBase.GetSDKs().Last();
var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
var sdk = WindowsPlatformBase.GetSDKs().Last();
var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
string dxilLocation = @$"{binLocation}\{architecture}\dxil.dll";
@@ -60,9 +77,9 @@ namespace Flax.Deps.Dependencies
string d3dcompilerLibLocation = @$"{sdkLibLocation}\{architecture}\d3dcompiler.lib";
Utilities.FileCopy(dxcompilerLibLocation, Path.Combine(depsFolder, Path.GetFileName(dxcompilerLibLocation)));
Utilities.FileCopy(d3dcompilerLibLocation, Path.Combine(depsFolder, "d3dcompiler_47.lib"));
break;
}
}
break;
}
}
}
}

View File

@@ -30,6 +30,30 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
case TargetPlatform.XboxOne:
case TargetPlatform.XboxScarlett:
return new[]
{
TargetArchitecture.x64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -47,44 +71,44 @@ namespace Flax.Deps.Dependencies
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in outputFileNames)
Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), configuration, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.UWP:
{
var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in outputFileNames)
Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.XboxOne:
case TargetPlatform.XboxScarlett:
{
var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
foreach (var file in outputFileNames)
Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
break;
}
}
break;
}
case TargetPlatform.UWP:
{
var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
foreach (var file in outputFileNames)
Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.XboxOne:
case TargetPlatform.XboxScarlett:
{
var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
foreach (var file in outputFileNames)
Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
break;
}
}
}

View File

@@ -0,0 +1,92 @@
// Copyright (c) Wojciech Figat. All rights reserved.
using System.IO;
using System.IO.Compression;
using Flax.Build;
namespace Flax.Deps.Dependencies
{
/// <summary>
/// Visual Studio EnvDTE COM library. https://learn.microsoft.com/en-us/dotnet/api/envdte?view=visualstudiosdk-2022
/// </summary>
/// <seealso cref="Flax.Deps.Dependency" />
class EnvDTE : Dependency
{
/// <inheritdoc />
public override TargetPlatform[] Platforms
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetPlatform.Windows,
};
default: return new TargetPlatform[0];
}
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
options.IntermediateFolder.Replace("/" + GetType().Name, "/Microsoft.VisualStudio.Setup.Configuration.Native");
// Get the source
var root = options.IntermediateFolder;
var packagePath = Path.Combine(root, $"package.zip");
if (!File.Exists(packagePath))
{
Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/Microsoft.VisualStudio.Setup.Configuration.Native/3.14.2075", packagePath);
}
var extractedPath = Path.Combine(root, "extracted");
if (!Directory.Exists(extractedPath))
{
using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
archive.ExtractToDirectory(extractedPath);
}
root = extractedPath;
foreach (var platform in options.Platforms)
{
foreach (var architecture in options.Architectures)
{
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
var bin = Path.Combine(root, "lib", "native", "v141", architecture.ToString().ToLower());
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
Utilities.FileCopy(Path.Combine(bin, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"), Path.Combine(depsFolder, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"));
var include = Path.Combine(root, "lib", "native", "include");
Utilities.FileCopy(Path.Combine(include, "Setup.Configuration.h"), Path.Combine(options.ThirdPartyFolder, "Microsoft.VisualStudio.Setup.Configuration.Native", "Setup.Configuration.h"));
break;
}
}
}
}
}
}
}

View File

@@ -36,6 +36,24 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{

View File

@@ -1,5 +1,6 @@
// Copyright (c) Wojciech Figat. All rights reserved.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
@@ -16,40 +17,6 @@ namespace Flax.Deps.Dependencies
{
private string root, nvCloth;
/// <inheritdoc />
public override TargetPlatform[] Platforms
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetPlatform.Windows,
TargetPlatform.XboxOne,
TargetPlatform.XboxScarlett,
TargetPlatform.PS4,
TargetPlatform.PS5,
TargetPlatform.Switch,
TargetPlatform.Android,
};
case TargetPlatform.Linux:
return new[]
{
TargetPlatform.Linux,
};
case TargetPlatform.Mac:
return new[]
{
TargetPlatform.Mac,
TargetPlatform.iOS,
};
default: return new TargetPlatform[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -59,41 +26,51 @@ namespace Flax.Deps.Dependencies
// Get the source
CloneGitRepoSingleBranch(root, "https://github.com/FlaxEngine/NvCloth.git", "master");
// Patch the CMakeLists.txt to support custom compilation flags
foreach (var os in new[] { "android", "ios", "linux", "mac", "windows", })
{
var filePath = Path.Combine(nvCloth, "compiler", "cmake", os, "CMakeLists.txt");
var appendLine = "SET(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${NVCLOTH_CXX_FLAGS}\")";
if (!File.ReadAllText(filePath).Contains(appendLine))
File.AppendAllText(filePath, Environment.NewLine + appendLine + Environment.NewLine);
}
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
Build(options, platform, TargetArchitecture.x64);
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.XboxOne:
case TargetPlatform.XboxScarlett:
Build(options, platform, TargetArchitecture.x64);
break;
case TargetPlatform.PS4:
case TargetPlatform.PS5:
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
Build(options, platform, TargetArchitecture.x64);
break;
case TargetPlatform.Switch:
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Android:
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Mac:
Build(options, platform, TargetArchitecture.x64);
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.iOS:
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Linux:
Build(options, platform, TargetArchitecture.x64);
break;
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
Build(options, platform, architecture);
break;
case TargetPlatform.XboxOne:
case TargetPlatform.XboxScarlett:
Build(options, platform, TargetArchitecture.x64);
break;
case TargetPlatform.PS4:
case TargetPlatform.PS5:
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
Build(options, platform, TargetArchitecture.x64);
break;
case TargetPlatform.Switch:
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Android:
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Mac:
Build(options, platform, architecture);
break;
case TargetPlatform.iOS:
Build(options, platform, TargetArchitecture.ARM64);
break;
case TargetPlatform.Linux:
Build(options, platform, architecture);
break;
}
}
}
@@ -110,7 +87,7 @@ namespace Flax.Deps.Dependencies
// Peek options
var binariesPrefix = string.Empty;
var binariesPostfix = string.Empty;
var cmakeArgs = "-DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
var cmakeName = string.Empty;
var buildFolder = Path.Combine(nvCloth, "compiler", platform.ToString() + '_' + architecture.ToString());
var envVars = new Dictionary<string, string>();
@@ -154,7 +131,7 @@ namespace Flax.Deps.Dependencies
}
break;
case TargetPlatform.Mac:
cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac";
cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
cmakeName = "mac";
binariesPrefix = "lib";
break;
@@ -164,7 +141,7 @@ namespace Flax.Deps.Dependencies
binariesPrefix = "lib";
break;
case TargetPlatform.Linux:
cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux";
cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
cmakeName = "linux";
binariesPrefix = "lib";
envVars.Add("CC", "clang-" + Configuration.LinuxClangMinVer);

View File

@@ -1,5 +1,5 @@
// Copyright (c) Wojciech Figat. All rights reserved.
//#define USE_GIT_REPOSITORY
using System;
using System.Collections.Generic;
using System.IO;
@@ -45,132 +45,75 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
case TargetPlatform.Linux:
return new[]
{
TargetArchitecture.x64,
//TargetArchitecture.ARM64,
};
case TargetPlatform.Mac:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
case TargetPlatform.iOS:
return new[]
{
TargetArchitecture.ARM64,
};
case TargetPlatform.Android:
return new[]
{
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
var root = options.IntermediateFolder;
var version = "1.24.3";
var configuration = "Release";
var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5";
var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
var noSSL = true; // OpenAL Soft website has broken certs
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
{
case TargetPlatform.Windows:
{
var binariesToCopy = new[]
{
"OpenAL32.lib",
"OpenAL32.dll",
};
// Get the source
CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
// Build for Win64 and ARM64
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
{
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\"");
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
}
#if false
// Get the binaries
var packagePath = Path.Combine(root, "package.zip");
if (!File.Exists(packagePath))
Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
{
if (!Directory.Exists(root))
archive.ExtractToDirectory(root);
root = Path.Combine(root, archive.Entries.First().FullName);
}
// Deploy Win64 binaries
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
// Deploy license
Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
// Deploy header files
var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
foreach (var file in files)
{
Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
}
#if !USE_GIT_REPOSITORY
if (options.Platforms.Contains(TargetPlatform.Windows))
#endif
break;
}
case TargetPlatform.Linux:
{
// Get the source
CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
}
#if !USE_GIT_REPOSITORY
else
{
// Get the source
var packagePath = Path.Combine(root, $"package-{version}.zip");
if (!File.Exists(packagePath))
{
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CC", "clang-" + Configuration.LinuxClangMinVer },
{ "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
{ "CXX", "clang++-" + Configuration.LinuxClangMinVer },
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
$"-DALSOFT_REQUIRE_OSS=ON " +
$"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
$"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
$"-DALSOFT_REQUIRE_JACK=ON " +
$"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
$"-DALSOFT_EMBED_HRTF_DATA=YES ";
// Get the source
var packagePath = Path.Combine(root, "package.zip");
File.Delete(packagePath);
Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
// Use separate build directory
root = Path.Combine(root, "openal-soft-" + version);
var buildDir = Path.Combine(root, "build");
SetupDirectory(buildDir, true);
// Build for Linux
Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
BuildCmake(buildDir, configuration, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.Android:
{
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES";
// Get the source
var packagePath = Path.Combine(root, "package.zip");
File.Delete(packagePath);
Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
{
// TODO: Maybe use PowerShell Expand-Archive instead?
var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
Utilities.Run(sevenZip, "x package.zip", null, root);
Utilities.Run(sevenZip, "x package", null, root);
@@ -179,89 +122,167 @@ namespace Flax.Deps.Dependencies
{
Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
}
// Use separate build directory
root = Path.Combine(root, "openal-soft-" + version);
var buildDir = Path.Combine(root, "build");
SetupDirectory(buildDir, true);
// Build
RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.Mac:
}
#endif
foreach (var platform in options.Platforms)
{
foreach (var architecture in options.Architectures)
{
var binariesToCopy = new[]
BuildStarted(platform, architecture);
switch (platform)
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
case TargetPlatform.Windows:
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
var binariesToCopy = new[]
{
"OpenAL32.lib",
"OpenAL32.dll",
};
// Get the source
var packagePath = Path.Combine(root, "package.zip");
File.Delete(packagePath);
Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
// Use separate build directory
root = Path.Combine(root, "openal-soft-" + version);
var buildDir = Path.Combine(root, "build");
// Build for Mac
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
{
// Build for Windows
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
SetupDirectory(buildDir, true);
RunCmake(buildDir, platform, architecture, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" " + cmakeArgs);
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
break;
}
case TargetPlatform.Linux:
{
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CC", "clang-" + Configuration.LinuxClangMinVer },
{ "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
{ "CXX", "clang++-" + Configuration.LinuxClangMinVer },
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
$"-DALSOFT_REQUIRE_OSS=ON " +
$"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
$"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
$"-DALSOFT_REQUIRE_JACK=ON " +
$"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
$"-DALSOFT_EMBED_HRTF_DATA=YES "
+ cmakeArgs;
// Use separate build directory
#if !USE_GIT_REPOSITORY
root = Path.Combine(root, "openal-soft-" + version);
#endif
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
SetupDirectory(buildDir, true);
// Build for Linux
RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, configuration, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.Android:
{
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = "-DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
// Use separate build directory
#if !USE_GIT_REPOSITORY
root = Path.Combine(root, "openal-soft-" + version);
#endif
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
SetupDirectory(buildDir, true);
// Build
RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
case TargetPlatform.Mac:
{
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
// Use separate build directory
#if !USE_GIT_REPOSITORY
root = Path.Combine(root, "openal-soft-" + version);
#endif
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
SetupDirectory(buildDir, true);
// Build for Mac
RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, architecture);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
break;
}
case TargetPlatform.iOS:
{
var binariesToCopy = new[]
case TargetPlatform.iOS:
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
var binariesToCopy = new[]
{
"libopenal.a",
};
var envVars = new Dictionary<string, string>
{
{ "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
};
var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
// Get the source
var packagePath = Path.Combine(root, "package.zip");
if (!File.Exists(packagePath))
{
Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
// Use separate build directory
#if !USE_GIT_REPOSITORY
root = Path.Combine(root, "openal-soft-" + version);
#endif
var buildDir = Path.Combine(root, "build-" + architecture.ToString());
SetupDirectory(buildDir, true);
// Build for iOS
RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
}
// Use separate build directory
root = Path.Combine(root, "openal-soft-" + version);
var buildDir = Path.Combine(root, "build");
// Build for iOS
SetupDirectory(buildDir, true);
RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
BuildCmake(buildDir, envVars);
var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
foreach (var file in binariesToCopy)
Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
break;
}
}
}
// Deploy license
Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
// Deploy header files
var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
foreach (var file in files)
{
Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
}
}
}
}

View File

@@ -17,40 +17,6 @@ namespace Flax.Deps.Dependencies
/// <seealso cref="Flax.Deps.Dependency" />
class PhysX : Dependency
{
/// <inheritdoc />
public override TargetPlatform[] Platforms
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetPlatform.Windows,
TargetPlatform.XboxOne,
TargetPlatform.PS4,
TargetPlatform.PS5,
TargetPlatform.XboxScarlett,
TargetPlatform.Android,
TargetPlatform.Switch,
};
case TargetPlatform.Linux:
return new[]
{
TargetPlatform.Linux,
};
case TargetPlatform.Mac:
return new[]
{
TargetPlatform.Mac,
TargetPlatform.iOS,
};
default: return new TargetPlatform[0];
}
}
}
private string root;
private string projectGenDir;
private string projectGenPath;
@@ -65,8 +31,13 @@ namespace Flax.Deps.Dependencies
if (cmakeSwitch.HasAttribute("name") && cmakeSwitch.Attributes["name"].Value == name)
{
cmakeSwitch.Attributes["value"].Value = value;
return;
}
}
var child = cmakeSwitches.OwnerDocument.CreateElement(cmakeSwitches.ChildNodes[0].Name);
child.SetAttribute("name", name);
child.SetAttribute("value", value);
cmakeSwitches.AppendChild(child);
}
private void Build(BuildOptions options, string preset, TargetPlatform targetPlatform, TargetArchitecture architecture)
@@ -94,11 +65,14 @@ namespace Flax.Deps.Dependencies
case TargetPlatform.Windows:
if (architecture == TargetArchitecture.ARM64)
{
// Windows ARM64 doesn't have GPU support, so avoid copying those DLLs around
// Windows ARM64 doesn't have precompiled files for GPU support, so avoid copying those DLLs around
ConfigureCmakeSwitch(cmakeSwitches, "PX_COPY_EXTERNAL_DLL", "OFF");
ConfigureCmakeSwitch(cmakeParams, "PX_COPY_EXTERNAL_DLL", "OFF");
}
break;
case TargetPlatform.Linux:
ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
break;
case TargetPlatform.Android:
ConfigureCmakeSwitch(cmakeParams, "CMAKE_INSTALL_PREFIX", $"install/android-{Configuration.AndroidPlatformApi}/PhysX");
ConfigureCmakeSwitch(cmakeParams, "ANDROID_NATIVE_API_LEVEL", $"android-{Configuration.AndroidPlatformApi}");
@@ -106,6 +80,7 @@ namespace Flax.Deps.Dependencies
break;
case TargetPlatform.Mac:
ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.MacOSXMinVer);
ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
break;
case TargetPlatform.iOS:
ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.iOSMinVer);
@@ -122,10 +97,11 @@ namespace Flax.Deps.Dependencies
string bits;
string arch;
string binariesSubDir;
string buildPlatform;
string buildPlatform = architecture == TargetArchitecture.x86 ? "Win32" : architecture.ToString();
bool suppressBitsPostfix = false;
string binariesPrefix = string.Empty;
var envVars = new Dictionary<string, string>();
envVars.Add("CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel);
switch (architecture)
{
case TargetArchitecture.x86:
@@ -146,15 +122,6 @@ namespace Flax.Deps.Dependencies
break;
default: throw new InvalidArchitectureException(architecture);
}
switch (architecture)
{
case TargetArchitecture.x86:
buildPlatform = "Win32";
break;
default:
buildPlatform = architecture.ToString();
break;
}
var msBuildProps = new Dictionary<string, string>();
switch (targetPlatform)
{
@@ -385,60 +352,84 @@ namespace Flax.Deps.Dependencies
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
Build(options, "vc17win64", platform, TargetArchitecture.x64);
Build(options, "vc17win-arm64", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.Linux:
{
Build(options, "linux", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.PS4:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "ps4", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.PS5:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "ps5", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.XboxScarlett:
case TargetPlatform.XboxOne:
{
Build(options, "vc16win64", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.Android:
{
Build(options, "android", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.Switch:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "switch64", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.Mac:
{
Build(options, "mac64", platform, TargetArchitecture.x64);
Build(options, "mac-arm64", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.iOS:
{
Build(options, "ios64", platform, TargetArchitecture.ARM64);
break;
}
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
if (architecture == TargetArchitecture.x64 || architecture == TargetArchitecture.ARM64)
{
if (WindowsPlatform.GetToolsets().Any(x => x.Key == WindowsPlatformToolset.v145))
{
try
{
Build(options, architecture == TargetArchitecture.x64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
}
catch (Exception e)
{
Log.Warning($"Failed to generate VS2026 solution for PhysX, fallback to VS2022: {e.Message}");
Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
}
}
else
Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
}
else
throw new InvalidArchitectureException(architecture);
break;
}
case TargetPlatform.Linux:
{
Build(options, "linux", platform, architecture);
break;
}
case TargetPlatform.PS4:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "ps4", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.PS5:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "ps5", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.XboxScarlett:
case TargetPlatform.XboxOne:
{
Build(options, "vc16win64", platform, TargetArchitecture.x64);
break;
}
case TargetPlatform.Android:
{
Build(options, "android", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.Switch:
{
Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
Build(options, "switch64", platform, TargetArchitecture.ARM64);
break;
}
case TargetPlatform.Mac:
{
if (architecture == TargetArchitecture.x64)
Build(options, "mac64", platform, architecture);
else if (architecture == TargetArchitecture.ARM64)
Build(options, "mac-arm64", platform, architecture);
else
throw new InvalidArchitectureException(architecture);
break;
}
case TargetPlatform.iOS:
{
Build(options, "ios64", platform, TargetArchitecture.ARM64);
break;
}
}
}
}
@@ -446,7 +437,7 @@ namespace Flax.Deps.Dependencies
var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "PhysX");
Directory.GetFiles(dstIncludePath, "*.h", SearchOption.AllDirectories).ToList().ForEach(File.Delete);
Utilities.FileCopy(Path.Combine(root, "LICENSE.md"), Path.Combine(dstIncludePath, "License.txt"));
Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath);
Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath, true, true);
}
}
}

View File

@@ -29,6 +29,24 @@ namespace Flax.Deps.Dependencies
}
}
/// <inheritdoc />
public override TargetArchitecture[] Architectures
{
get
{
switch (BuildPlatform)
{
case TargetPlatform.Windows:
return new[]
{
TargetArchitecture.x64,
TargetArchitecture.ARM64,
};
default: return new TargetArchitecture[0];
}
}
}
/// <inheritdoc />
public override void Build(BuildOptions options)
{
@@ -47,23 +65,23 @@ namespace Flax.Deps.Dependencies
foreach (var platform in options.Platforms)
{
BuildStarted(platform);
switch (platform)
foreach (var architecture in options.Architectures)
{
case TargetPlatform.Windows:
{
// Build for Win64
foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
BuildStarted(platform, architecture);
switch (platform)
{
case TargetPlatform.Windows:
{
// Build for Windows
Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString(), new Dictionary<string, string>() { { "RestorePackagesConfig", "true" } });
var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
foreach (var file in outputFileNames)
{
Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), "Release", file), Path.Combine(depsFolder, file));
}
break;
}
}
break;
}
}
}

Some files were not shown because too many files have changed in this diff Show More