diff --git a/Source/Editor/Content/Proxy/PrefabProxy.cs b/Source/Editor/Content/Proxy/PrefabProxy.cs
index c0c4e5c88..d2971f296 100644
--- a/Source/Editor/Content/Proxy/PrefabProxy.cs
+++ b/Source/Editor/Content/Proxy/PrefabProxy.cs
@@ -73,6 +73,16 @@ namespace FlaxEditor.Content
return targetLocation.CanHaveAssets;
}
+ /// <inheritdoc />
+ public override bool CanReimport(ContentItem item)
+ {
+ if (item is not PrefabItem prefabItem)
+ return base.CanReimport(item);
+
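+ // Only prefabs created by the model importer (root actor has a ModelPrefab script) can be reimported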
+ var prefab = FlaxEngine.Content.Load<Prefab>(prefabItem.ID);
+ return prefab.GetDefaultInstance().GetScript<ModelPrefab>() != null;
+ }
+
/// <inheritdoc />
public override void Create(string outputPath, object arg)
{
diff --git a/Source/Editor/Windows/ContentWindow.ContextMenu.cs b/Source/Editor/Windows/ContentWindow.ContextMenu.cs
index f29f97e66..182467566 100644
--- a/Source/Editor/Windows/ContentWindow.ContextMenu.cs
+++ b/Source/Editor/Windows/ContentWindow.ContextMenu.cs
@@ -309,6 +309,23 @@ namespace FlaxEditor.Windows
{
if (selection[i] is BinaryAssetItem binaryAssetItem)
Editor.ContentImporting.Reimport(binaryAssetItem);
+ else if (selection[i] is PrefabItem prefabItem)
+ {
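+ // Reimport the source model file of a model prefab using the import path and options stored on its ModelPrefab script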
+ var prefab = FlaxEngine.Content.Load<Prefab>(prefabItem.ID);
+ var modelPrefab = prefab.GetDefaultInstance().GetScript<ModelPrefab>();
+ if (!modelPrefab)
+ continue;
+ var importPath = modelPrefab.ImportPath;
+ var editor = Editor.Instance;
+ if (editor.ContentImporting.GetReimportPath("Model Prefab", ref importPath))
+ continue;
+ var folder = editor.ContentDatabase.Find(Path.GetDirectoryName(prefab.Path)) as ContentFolder;
+ if (folder == null)
+ continue;
+ var importOptions = modelPrefab.ImportOptions;
+ importOptions.Type = FlaxEngine.Tools.ModelTool.ModelType.Prefab;
+ editor.ContentImporting.Import(importPath, folder, true, importOptions);
+ }
}
}
diff --git a/Source/Engine/Graphics/Models/ModelData.Tool.cpp b/Source/Engine/Graphics/Models/ModelData.Tool.cpp
index be6b64398..a62fa6bae 100644
--- a/Source/Engine/Graphics/Models/ModelData.Tool.cpp
+++ b/Source/Engine/Graphics/Models/ModelData.Tool.cpp
@@ -15,12 +15,12 @@
#define USE_MIKKTSPACE 1
#include "ThirdParty/MikkTSpace/mikktspace.h"
#if USE_ASSIMP
-#define USE_SPARIAL_SORT 1
+#define USE_SPATIAL_SORT 1
#define ASSIMP_BUILD_NO_EXPORT
#include "Engine/Tools/ModelTool/SpatialSort.h"
//#include
#else
-#define USE_SPARIAL_SORT 0
+#define USE_SPATIAL_SORT 0
#endif
#include <vector>
@@ -155,18 +155,18 @@ bool MeshData::GenerateLightmapUVs()
}
int32 FindVertex(const MeshData& mesh, int32 vertexIndex, int32 startIndex, int32 searchRange, const Array<int32>& mapping
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
, const Assimp::SpatialSort& spatialSort
- , std::vector<unsigned int>& sparialSortCache
+ , std::vector<unsigned int>& spatialSortCache
#endif
)
{
const float uvEpsSqr = (1.0f / 250.0f) * (1.0f / 250.0f);
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
const Float3 vPosition = mesh.Positions[vertexIndex];
- spatialSort.FindPositions(*(aiVector3D*)&vPosition, 1e-4f, sparialSortCache);
- if (sparialSortCache.empty())
+ spatialSort.FindPositions(*(aiVector3D*)&vPosition, 1e-5f, spatialSortCache);
+ if (spatialSortCache.empty())
return INVALID_INDEX;
const Float2 vUV = mesh.UVs.HasItems() ? mesh.UVs[vertexIndex] : Float2::Zero;
@@ -177,9 +177,9 @@ int32 FindVertex(const MeshData& mesh, int32 vertexIndex, int32 startIndex, int3
const int32 end = startIndex + searchRange;
- for (size_t i = 0; i < sparialSortCache.size(); i++)
+ for (size_t i = 0; i < spatialSortCache.size(); i++)
{
- const int32 v = sparialSortCache[i];
+ const int32 v = spatialSortCache[i];
if (v < startIndex || v >= end)
continue;
#else
@@ -247,11 +247,11 @@ void MeshData::BuildIndexBuffer()
mapping.Resize(vertexCount);
int32 newVertexCounter = 0;
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
// Set up a SpatialSort to quickly find all vertices close to a given position
Assimp::SpatialSort vertexFinder;
vertexFinder.Fill((const aiVector3D*)Positions.Get(), vertexCount, sizeof(Float3));
- std::vector<unsigned int> sparialSortCache;
+ std::vector<unsigned int> spatialSortCache;
#endif
// Build index buffer
@@ -259,8 +259,8 @@ void MeshData::BuildIndexBuffer()
{
// Find duplicated vertex before the current one
const int32 reuseVertexIndex = FindVertex(*this, vertexIndex, 0, vertexIndex, mapping
-#if USE_SPARIAL_SORT
- , vertexFinder, sparialSortCache
+#if USE_SPATIAL_SORT
+ , vertexFinder, spatialSortCache
#endif
);
if (reuseVertexIndex == INVALID_INDEX)
@@ -304,18 +304,15 @@ void MeshData::BuildIndexBuffer()
dstBlendShape.Name = srcBlendShape.Name;
dstBlendShape.Weight = srcBlendShape.Weight;
- dstBlendShape.Vertices.Resize(newVertexCounter);
- for (int32 i = 0, j = 0; i < srcBlendShape.Vertices.Count(); i++)
+ dstBlendShape.Vertices.EnsureCapacity(srcBlendShape.Vertices.Count());
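+ // Keep only the blend shape vertices whose source vertex survived deduplication and remap them to the new vertex indices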
+ for (int32 i = 0; i < srcBlendShape.Vertices.Count(); i++)
{
- const auto idx = mapping[i];
- if (idx != INVALID_INDEX)
+ auto& v = srcBlendShape.Vertices[i];
+ int32 newVertexIndex = v.VertexIndex < (uint32)vertexCount ? mapping[v.VertexIndex] : INVALID_INDEX;
+ if (newVertexIndex != INVALID_INDEX)
{
- auto& v = srcBlendShape.Vertices[i];
- ASSERT_LOW_LAYER(v.VertexIndex < (uint32)vertexCount);
- ASSERT_LOW_LAYER(mapping[v.VertexIndex] != INVALID_INDEX);
- v.VertexIndex = mapping[v.VertexIndex];
- ASSERT_LOW_LAYER(v.VertexIndex < (uint32)newVertexCounter);
- dstBlendShape.Vertices[j++] = v;
+ v.VertexIndex = newVertexIndex;
+ dstBlendShape.Vertices.Add(v);
}
}
}
@@ -376,7 +373,7 @@ bool MeshData::GenerateNormals(float smoothingAngle)
Float3::Max(max, v3, max);
}
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
// Set up a SpatialSort to quickly find all vertices close to a given position
Assimp::SpatialSort vertexFinder;
vertexFinder.Fill((const aiVector3D*)Positions.Get(), vertexCount, sizeof(Float3));
@@ -399,7 +396,7 @@ bool MeshData::GenerateNormals(float smoothingAngle)
continue;
// Get all vertices that share this one
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
vertexFinder.FindPositions(*(aiVector3D*)&Positions[i], posEpsilon, verticesFound);
const int32 verticesFoundCount = (int32)verticesFound.size();
#else
@@ -429,7 +426,7 @@ bool MeshData::GenerateNormals(float smoothingAngle)
for (int32 i = 0; i < vertexCount; i++)
{
// Get all vertices that share this one
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
vertexFinder.FindPositions(*(aiVector3D*)&Positions[i], posEpsilon, verticesFound);
const int32 verticesFoundCount = (int32)verticesFound.size();
#else
@@ -623,7 +620,7 @@ bool MeshData::GenerateTangents(float smoothingAngle)
}
}
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
// Set up a SpatialSort to quickly find all vertices close to a given position
Assimp::SpatialSort vertexFinder;
vertexFinder.Fill((const aiVector3D*)Positions.Get(), vertexCount, sizeof(Float3));
@@ -648,7 +645,7 @@ bool MeshData::GenerateTangents(float smoothingAngle)
closeVertices.Clear();
// Find all vertices close to that position
-#if USE_SPARIAL_SORT
+#if USE_SPATIAL_SORT
vertexFinder.FindPositions(*(aiVector3D*)&origPos, posEpsilon, verticesFound);
const int32 verticesFoundCount = (int32)verticesFound.size();
#else
diff --git a/Source/Engine/Tools/ModelTool/ModelTool.OpenFBX.cpp b/Source/Engine/Tools/ModelTool/ModelTool.OpenFBX.cpp
index cf124f977..dcb0b6c5f 100644
--- a/Source/Engine/Tools/ModelTool/ModelTool.OpenFBX.cpp
+++ b/Source/Engine/Tools/ModelTool/ModelTool.OpenFBX.cpp
@@ -6,6 +6,7 @@
#include "Engine/Core/Log.h"
#include "Engine/Core/Math/Mathd.h"
#include "Engine/Core/Math/Matrix.h"
+#include "Engine/Core/Math/Plane.h"
#include "Engine/Core/Collections/Sorting.h"
#include "Engine/Platform/FileSystem.h"
#include "Engine/Tools/TextureTool/TextureTool.h"
@@ -13,6 +14,11 @@
#include "Engine/Platform/File.h"
#define OPEN_FBX_CONVERT_SPACE 1
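+// Gets a reusable cache list from the importer data: a bounds-checked Array reference in Debug builds, a raw pointer in Release builds for speed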
+#if BUILD_DEBUG
+#define OPEN_FBX_GET_CACHE_LIST(arrayName, varName, size) data.arrayName.Resize(size, false); auto& varName = data.arrayName
+#else
+#define OPEN_FBX_GET_CACHE_LIST(arrayName, varName, size) data.arrayName.Resize(size, false); auto* varName = data.arrayName.Get()
+#endif
// Import OpenFBX library
// Source: https://github.com/nem0/OpenFBX
@@ -49,7 +55,7 @@ Quaternion ToQuaternion(const ofbx::Quat& v)
return Quaternion((float)v.x, (float)v.y, (float)v.z, (float)v.w);
}
-Matrix ToMatrix(const ofbx::Matrix& mat)
+Matrix ToMatrix(const ofbx::DMatrix& mat)
{
Matrix result;
for (int32 i = 0; i < 16; i++)
@@ -103,6 +109,13 @@ struct OpenFbxImporterData
Array Materials;
Array ImportedMaterials;
+ Array<int> TriangulatedIndicesCache;
+ Array<Int4> BlendIndicesCache;
+ Array<Float4> BlendWeightsCache;
+ Array<Float2> TriangulatePointsCache;
+ Array<int> TriangulateIndicesCache;
+ Array<int> TriangulateEarIndicesCache;
+
OpenFbxImporterData(const String& path, const ModelTool::Options& options, ofbx::IScene* scene)
: Scene(scene)
, ScenePtr(scene)
@@ -416,7 +429,7 @@ void ProcessNodes(OpenFbxImporterData& data, const ofbx::Object* aNode, int32 pa
Matrix GetOffsetMatrix(OpenFbxImporterData& data, const ofbx::Mesh* mesh, const ofbx::Object* node)
{
#if 1
- auto* skin = mesh ? mesh->getGeometry()->getSkin() : nullptr;
+ auto* skin = mesh ? mesh->getSkin() : nullptr;
if (skin)
{
for (int i = 0, c = skin->getClusterCount(); i < c; i++)
@@ -445,7 +458,7 @@ Matrix GetOffsetMatrix(OpenFbxImporterData& data, const ofbx::Mesh* mesh, const
bool IsMeshInvalid(const ofbx::Mesh* aMesh)
{
- return aMesh->getGeometry()->getVertexCount() == 0;
+ return aMesh->getGeometryData().getPositions().count == 0;
}
bool ImportBones(OpenFbxImporterData& data, String& errorMsg)
@@ -455,8 +468,7 @@ bool ImportBones(OpenFbxImporterData& data, String& errorMsg)
for (int i = 0; i < meshCount; i++)
{
const auto aMesh = data.Scene->getMesh(i);
- const auto aGeometry = aMesh->getGeometry();
- const ofbx::Skin* skin = aGeometry->getSkin();
+ const ofbx::Skin* skin = aMesh->getSkin();
if (skin == nullptr || IsMeshInvalid(aMesh))
continue;
@@ -524,56 +536,198 @@ bool ImportBones(OpenFbxImporterData& data, String& errorMsg)
return false;
}
-bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh* aMesh, MeshData& mesh, String& errorMsg, int32 triangleStart, int32 triangleEnd)
+int Triangulate(OpenFbxImporterData& data, const ofbx::GeometryData& geom, const ofbx::GeometryPartition::Polygon& polygon, int* triangulatedIndices)
+{
+ if (polygon.vertex_count < 3)
+ return 0;
+ else if (polygon.vertex_count == 3)
+ {
+ triangulatedIndices[0] = polygon.from_vertex;
+ triangulatedIndices[1] = polygon.from_vertex + 1;
+ triangulatedIndices[2] = polygon.from_vertex + 2;
+ return 3;
+ }
+ else if (polygon.vertex_count == 4)
+ {
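+ // Quad fast-path: split into two triangles (0,1,2) and (0,2,3)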
+ triangulatedIndices[0] = polygon.from_vertex + 0;
+ triangulatedIndices[1] = polygon.from_vertex + 1;
+ triangulatedIndices[2] = polygon.from_vertex + 2;
+ triangulatedIndices[3] = polygon.from_vertex + 0;
+ triangulatedIndices[4] = polygon.from_vertex + 2;
+ triangulatedIndices[5] = polygon.from_vertex + 3;
+ return 6;
+ }
+
+ const ofbx::Vec3Attributes& positions = geom.getPositions();
+ Float3 normal = ToFloat3(geom.getNormals().get(polygon.from_vertex));
+
+ // Check if the polygon is convex
+ int lastSign = 0;
+ bool isConvex = true;
+ for (int i = 0; i < polygon.vertex_count; i++)
+ {
+ Float3 v1 = ToFloat3(positions.get(polygon.from_vertex + i));
+ Float3 v2 = ToFloat3(positions.get(polygon.from_vertex + (i + 1) % polygon.vertex_count));
+ Float3 v3 = ToFloat3(positions.get(polygon.from_vertex + (i + 2) % polygon.vertex_count));
+
+ // The winding order of all triangles must be the same for the polygon to be considered convex
+ int sign;
+ Float3 c = Float3::Cross(v1 - v2, v3 - v2);
+ if (c.LengthSquared() == 0.0f)
+ continue;
+ else if (Math::NotSameSign(c.X, normal.X) || Math::NotSameSign(c.Y, normal.Y) || Math::NotSameSign(c.Z, normal.Z))
+ sign = 1;
+ else
+ sign = -1;
+ if ((sign < 0 && lastSign > 0) || (sign > 0 && lastSign < 0))
+ {
+ isConvex = false;
+ break;
+ }
+ lastSign += sign;
+ }
+
+ // Fast-path for convex case
+ if (isConvex)
+ {
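+ // Emit a simple triangle fan anchored at the first polygon vertex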
+ for (int i = 0; i < polygon.vertex_count - 2; i++)
+ {
+ triangulatedIndices[i * 3 + 0] = polygon.from_vertex;
+ triangulatedIndices[i * 3 + 1] = polygon.from_vertex + (i + 1) % polygon.vertex_count;
+ triangulatedIndices[i * 3 + 2] = polygon.from_vertex + (i + 2) % polygon.vertex_count;
+ }
+ return 3 * (polygon.vertex_count - 2);
+ }
+
+ // Set up arrays for temporary data (TODO: maybe a doubly-linked list would be more optimal?)
+ auto& points = data.TriangulatePointsCache;
+ auto& indices = data.TriangulateIndicesCache;
+ auto& earIndices = data.TriangulateEarIndicesCache;
+ points.Clear();
+ indices.Clear();
+ earIndices.Clear();
+ points.EnsureCapacity(polygon.vertex_count, false);
+ indices.EnsureCapacity(polygon.vertex_count, false);
+ earIndices.EnsureCapacity(3 * (polygon.vertex_count - 2), false);
+
+ // Project the points onto a plane, choosing two arbitrary axes
+ const Float3 u = Float3::Cross(normal, Math::Abs(normal.X) > Math::Abs(normal.Y) ? Float3::Up : Float3::Right).GetNormalized();
+ const Float3 v = Float3::Cross(normal, u).GetNormalized();
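+ // The 2D coordinates are the dot products of the projected point with the u/v basis vectors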
+ for (int i = 0; i < polygon.vertex_count; i++)
+ {
+ const Float3 point = ToFloat3(positions.get(polygon.from_vertex + i));
+ const Float3 projectedPoint = Float3::ProjectOnPlane(point, normal);
+ const Float2 pointOnPlane = Float2(
+ projectedPoint.X * u.X + projectedPoint.Y * u.Y + projectedPoint.Z * u.Z,
+ projectedPoint.X * v.X + projectedPoint.Y * v.Y + projectedPoint.Z * v.Z);
+
+ points.Add(pointOnPlane);
+ indices.Add(i);
+ }
+
+ // Triangulate non-convex polygons using a simple ear-clipping algorithm (https://nils-olovsson.se/articles/ear_clipping_triangulation/)
+ const int maxIterations = indices.Count() * 10; // Safeguard to prevent an infinite loop
+ int index = 0;
+ while (indices.Count() > 3 && index < maxIterations)
+ {
+ const int i1 = index % indices.Count();
+ const int i2 = (index + 1) % indices.Count();
+ const int i3 = (index + 2) % indices.Count();
+ const Float2 p1 = points[indices[i1]];
+ const Float2 p2 = points[indices[i2]];
+ const Float2 p3 = points[indices[i3]];
+
+ // TODO: Skip triangles with very sharp angles?
+
+ // Skip reflex vertices
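+ // A negative 2D cross product marks an inward-bending (reflex) corner that cannot be clipped as an ear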
+ if (Float2::Cross(p2 - p1, p3 - p1) < 0.0f)
+ {
+ index++;
+ continue;
+ }
+
+ // The triangle is considered to be an "ear" when no other points reside inside the triangle
+ bool isEar = true;
+ for (int j = 0; j < indices.Count(); j++)
+ {
+ if (j == i1 || j == i2 || j == i3)
+ continue;
+ const Float2 candidate = points[indices[j]];
+ if (CollisionsHelper::IsPointInTriangle(candidate, p1, p2, p3))
+ {
+ isEar = false;
+ break;
+ }
+ }
+ if (!isEar)
+ {
+ index++;
+ continue;
+ }
+
+ // Add an ear and remove the tip point from evaluation
+ earIndices.Add(indices[i1]);
+ earIndices.Add(indices[i2]);
+ earIndices.Add(indices[i3]);
+ indices.RemoveAtKeepOrder(i2);
+ }
+
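+ // Emit the clipped ears followed by the final remaining triangle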
+ for (int i = 0; i < earIndices.Count(); i++)
+ triangulatedIndices[i] = polygon.from_vertex + (earIndices[i] % polygon.vertex_count);
+ triangulatedIndices[earIndices.Count() + 0] = polygon.from_vertex + (indices[0] % polygon.vertex_count);
+ triangulatedIndices[earIndices.Count() + 1] = polygon.from_vertex + (indices[1] % polygon.vertex_count);
+ triangulatedIndices[earIndices.Count() + 2] = polygon.from_vertex + (indices[2] % polygon.vertex_count);
+
+ return 3 * (polygon.vertex_count - 2);
+}
+
+bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh* aMesh, MeshData& mesh, String& errorMsg, int partitionIndex)
{
PROFILE_CPU();
mesh.Name = aMesh->name;
ZoneText(*mesh.Name, mesh.Name.Length());
- const int32 firstVertexOffset = triangleStart * 3;
- const int32 lastVertexOffset = triangleEnd * 3;
- const ofbx::Geometry* aGeometry = aMesh->getGeometry();
- const int vertexCount = lastVertexOffset - firstVertexOffset + 3;
- ASSERT(firstVertexOffset + vertexCount <= aGeometry->getVertexCount());
- const ofbx::Vec3* vertices = aGeometry->getVertices();
- const ofbx::Vec3* normals = aGeometry->getNormals();
- const ofbx::Vec3* tangents = aGeometry->getTangents();
- const ofbx::Vec4* colors = aGeometry->getColors();
- const ofbx::Vec2* uvs = aGeometry->getUVs();
- const ofbx::Skin* skin = aGeometry->getSkin();
- const ofbx::BlendShape* blendShape = aGeometry->getBlendShape();
+ const ofbx::GeometryData& geometryData = aMesh->getGeometryData();
+ const ofbx::GeometryPartition& partition = geometryData.getPartition(partitionIndex);
+ const int vertexCount = partition.triangles_count * 3;
+ const ofbx::Vec3Attributes& positions = geometryData.getPositions();
+ const ofbx::Vec2Attributes& uvs = geometryData.getUVs();
+ const ofbx::Vec3Attributes& normals = geometryData.getNormals();
+ const ofbx::Vec3Attributes& tangents = geometryData.getTangents();
+ const ofbx::Vec4Attributes& colors = geometryData.getColors();
+ const ofbx::Skin* skin = aMesh->getSkin();
+ const ofbx::BlendShape* blendShape = aMesh->getBlendShape();
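+ // Maps each output vertex of the triangulated partition back to its source polygon-vertex index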
+ OPEN_FBX_GET_CACHE_LIST(TriangulatedIndicesCache, triangulatedIndices, vertexCount);
// Properties
const ofbx::Material* aMaterial = nullptr;
if (aMesh->getMaterialCount() > 0)
- {
- if (aGeometry->getMaterials())
- aMaterial = aMesh->getMaterial(aGeometry->getMaterials()[triangleStart]);
- else
- aMaterial = aMesh->getMaterial(0);
- }
+ aMaterial = aMesh->getMaterial(partitionIndex);
mesh.MaterialSlotIndex = data.AddMaterial(result, aMaterial);
// Vertex positions
mesh.Positions.Resize(vertexCount, false);
- for (int i = 0; i < vertexCount; i++)
- mesh.Positions.Get()[i] = ToFloat3(vertices[i + firstVertexOffset]);
+ {
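+ // Triangulate every polygon of the partition and expand positions into a flat, de-indexed vertex stream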
+ int numIndicesTotal = 0;
+ for (int i = 0; i < partition.polygon_count; i++)
+ {
+ int numIndices = Triangulate(data, geometryData, partition.polygons[i], &triangulatedIndices[numIndicesTotal]);
+ for (int j = numIndicesTotal; j < numIndicesTotal + numIndices; j++)
+ mesh.Positions.Get()[j] = ToFloat3(positions.get(triangulatedIndices[j]));
+ numIndicesTotal += numIndices;
+ }
+ }
// Indices (dummy index buffer)
- if (vertexCount % 3 != 0)
- {
- errorMsg = TEXT("Invalid vertex count. It must be multiple of 3.");
- return true;
- }
mesh.Indices.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
mesh.Indices.Get()[i] = i;
// Texture coordinates
- if (uvs)
+ if (uvs.values)
{
mesh.UVs.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
- mesh.UVs.Get()[i] = ToFloat2(uvs[i + firstVertexOffset]);
+ mesh.UVs.Get()[i] = ToFloat2(uvs.get(triangulatedIndices[i]));
if (data.ConvertRH)
{
for (int32 v = 0; v < vertexCount; v++)
@@ -582,7 +736,7 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
}
// Normals
- if (data.Options.CalculateNormals || !normals)
+ if (data.Options.CalculateNormals || !normals.values)
{
if (mesh.GenerateNormals(data.Options.SmoothingNormalsAngle))
{
@@ -590,11 +744,11 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
return true;
}
}
- else if (normals)
+ else if (normals.values)
{
mesh.Normals.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
- mesh.Normals.Get()[i] = ToFloat3(normals[i + firstVertexOffset]);
+ mesh.Normals.Get()[i] = ToFloat3(normals.get(triangulatedIndices[i]));
if (data.ConvertRH)
{
// Mirror normals along the Z axis
@@ -604,15 +758,15 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
}
// Tangents
- if ((data.Options.CalculateTangents || !tangents) && mesh.UVs.HasItems())
+ if ((data.Options.CalculateTangents || !tangents.values) && mesh.UVs.HasItems())
{
// Generated after full mesh data conversion
}
- else if (tangents)
+ else if (tangents.values)
{
mesh.Tangents.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
- mesh.Tangents.Get()[i] = ToFloat3(tangents[i + firstVertexOffset]);
+ mesh.Tangents.Get()[i] = ToFloat3(tangents.get(triangulatedIndices[i]));
if (data.ConvertRH)
{
// Mirror tangents along the Z axis
@@ -658,12 +812,12 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
}
// Check if has that channel texcoords
- const auto lightmapUVs = aGeometry->getUVs(inputChannelIndex);
- if (lightmapUVs)
+ const auto lightmapUVs = geometryData.getUVs(inputChannelIndex);
+ if (lightmapUVs.values)
{
mesh.LightmapUVs.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
- mesh.LightmapUVs.Get()[i] = ToFloat2(lightmapUVs[i + firstVertexOffset]);
+ mesh.LightmapUVs.Get()[i] = ToFloat2(lightmapUVs.get(triangulatedIndices[i]));
if (data.ConvertRH)
{
for (int32 v = 0; v < vertexCount; v++)
@@ -677,20 +831,20 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
}
// Vertex Colors
- if (data.Options.ImportVertexColors && colors)
+ if (data.Options.ImportVertexColors && colors.values)
{
mesh.Colors.Resize(vertexCount, false);
for (int i = 0; i < vertexCount; i++)
- mesh.Colors.Get()[i] = ToColor(colors[i + firstVertexOffset]);
+ mesh.Colors.Get()[i] = ToColor(colors.get(triangulatedIndices[i]));
}
// Blend Indices and Blend Weights
if (skin && skin->getClusterCount() > 0 && EnumHasAnyFlags(data.Options.ImportTypes, ImportDataTypes::Skeleton))
{
- mesh.BlendIndices.Resize(vertexCount);
- mesh.BlendWeights.Resize(vertexCount);
- mesh.BlendIndices.SetAll(Int4::Zero);
- mesh.BlendWeights.SetAll(Float4::Zero);
+ OPEN_FBX_GET_CACHE_LIST(BlendIndicesCache, blendIndices, positions.values_count);
+ OPEN_FBX_GET_CACHE_LIST(BlendWeightsCache, blendWeights, positions.values_count);
+ data.BlendIndicesCache.SetAll(Int4::Zero);
+ data.BlendWeightsCache.SetAll(Float4::Zero);
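+ // Accumulate skinning influences per source control point; they are remapped to the triangulated vertices below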
for (int clusterIndex = 0, clusterCount = skin->getClusterCount(); clusterIndex < clusterCount; clusterIndex++)
{
@@ -718,12 +872,12 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
const double* clusterWeights = cluster->getWeights();
for (int j = 0; j < cluster->getIndicesCount(); j++)
{
- int vtxIndex = clusterIndices[j] - firstVertexOffset;
+ int vtxIndex = clusterIndices[j];
float vtxWeight = (float)clusterWeights[j];
- if (vtxWeight <= 0 || vtxIndex < 0 || vtxIndex >= vertexCount)
+ if (vtxWeight <= 0 || vtxIndex < 0 || vtxIndex >= positions.values_count)
continue;
- Int4& indices = mesh.BlendIndices.Get()[vtxIndex];
- Float4& weights = mesh.BlendWeights.Get()[vtxIndex];
+ Int4& indices = blendIndices[vtxIndex];
+ Float4& weights = blendWeights[vtxIndex];
for (int32 k = 0; k < 4; k++)
{
@@ -745,6 +899,16 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
}
}
+ // Remap blend values to triangulated data
+ mesh.BlendIndices.Resize(vertexCount, false);
+ mesh.BlendWeights.Resize(vertexCount, false);
+ for (int i = 0; i < vertexCount; i++)
+ {
+ const int idx = positions.indices[triangulatedIndices[i]];
+ mesh.BlendIndices.Get()[i] = blendIndices[idx];
+ mesh.BlendWeights.Get()[i] = blendWeights[idx];
+ }
+
mesh.NormalizeBlendWeights();
}
@@ -756,44 +920,43 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
{
const ofbx::BlendShapeChannel* channel = blendShape->getBlendShapeChannel(channelIndex);
- // Use last shape
+ // Use the last shape
const int targetShapeCount = channel->getShapeCount();
if (targetShapeCount == 0)
continue;
const ofbx::Shape* shape = channel->getShape(targetShapeCount - 1);
-
- if (shape->getVertexCount() != aGeometry->getVertexCount())
+ const ofbx::Vec3* shapeVertices = shape->getVertices();
+ const ofbx::Vec3* shapeNormals = shape->getNormals();
+ const int* shapeIndices = shape->getIndices();
+ const int shapeVertexCount = shape->getVertexCount();
+ const int shapeIndexCount = shape->getIndexCount();
+ if (shapeVertexCount != shapeIndexCount)
{
- LOG(Error, "Blend shape '{0}' in mesh '{1}' has different amount of vertices ({2}) than mesh ({3})", String(shape->name), mesh.Name, shape->getVertexCount(), aGeometry->getVertexCount());
+ LOG(Error, "Blend shape '{0}' in mesh '{1}' has different amount of vertices ({2}) and indices ({3})", String(shape->name), mesh.Name, shapeVertexCount, shapeIndexCount);
continue;
}
BlendShape& blendShapeData = mesh.BlendShapes.AddOne();
blendShapeData.Name = shape->name;
blendShapeData.Weight = channel->getShapeCount() > 1 ? (float)(channel->getDeformPercent() / 100.0) : 1.0f;
+ blendShapeData.Vertices.EnsureCapacity(shapeIndexCount);
- blendShapeData.Vertices.Resize(vertexCount);
- for (int32 i = 0; i < blendShapeData.Vertices.Count(); i++)
- blendShapeData.Vertices.Get()[i].VertexIndex = i;
-
- auto shapeVertices = shape->getVertices();
- for (int32 i = 0; i < blendShapeData.Vertices.Count(); i++)
+ for (int32 i = 0; i < shapeIndexCount; i++)
{
- auto delta = ToFloat3(shapeVertices[i + firstVertexOffset]) - mesh.Positions.Get()[i];
- blendShapeData.Vertices.Get()[i].PositionDelta = delta;
- }
-
- auto shapeNormals = shape->getNormals();
- for (int32 i = 0; i < blendShapeData.Vertices.Count(); i++)
- {
- auto delta = ToFloat3(shapeNormals[i + firstVertexOffset]);
- if (data.ConvertRH)
+ int shapeIndex = shapeIndices[i];
+ BlendShapeVertex v;
+ v.PositionDelta = ToFloat3(shapeVertices[i]);
+ v.NormalDelta = shapeNormals ? ToFloat3(shapeNormals[i]) : Float3::Zero;
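+ // Find every triangulated vertex that references this control point (linear scan over the partition)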
+ for (int32 vertexIndex = 0; vertexIndex < vertexCount; vertexIndex++)
{
- // Mirror normals along the Z axis
- delta.Z *= -1.0f;
+ int sourceIndex = positions.indices[triangulatedIndices[vertexIndex]];
+ if (sourceIndex == shapeIndex)
+ {
+ // Add blend shape vertex
+ v.VertexIndex = vertexIndex;
+ blendShapeData.Vertices.Add(v);
+ }
}
- delta = delta - mesh.Normals.Get()[i];
- blendShapeData.Vertices.Get()[i].NormalDelta = delta;
}
}
}
@@ -806,7 +969,10 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
for (auto& blendShapeData : mesh.BlendShapes)
{
for (auto& v : blendShapeData.Vertices)
+ {
v.PositionDelta.Z *= -1.0f;
+ v.NormalDelta.Z *= -1.0f;
+ }
}
}
@@ -820,7 +986,7 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
Swap(mesh.Indices.Get()[i], mesh.Indices.Get()[i + 2]);
}
- if ((data.Options.CalculateTangents || !tangents) && mesh.UVs.HasItems())
+ if ((data.Options.CalculateTangents || !tangents.values) && mesh.UVs.HasItems())
{
if (mesh.GenerateTangents(data.Options.SmoothingTangentsAngle))
{
@@ -858,7 +1024,7 @@ bool ProcessMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
return false;
}
-bool ImportMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh* aMesh, String& errorMsg, int32 triangleStart, int32 triangleEnd)
+bool ImportMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh* aMesh, String& errorMsg, int partitionIndex)
{
PROFILE_CPU();
@@ -899,7 +1065,7 @@ bool ImportMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
// Import mesh data
MeshData* meshData = New<MeshData>();
- if (ProcessMesh(result, data, aMesh, *meshData, errorMsg, triangleStart, triangleEnd))
+ if (ProcessMesh(result, data, aMesh, *meshData, errorMsg, partitionIndex))
return true;
// Link mesh
@@ -916,36 +1082,17 @@ bool ImportMesh(ModelData& result, OpenFbxImporterData& data, const ofbx::Mesh*
bool ImportMesh(int32 index, ModelData& result, OpenFbxImporterData& data, String& errorMsg)
{
const auto aMesh = data.Scene->getMesh(index);
- const auto aGeometry = aMesh->getGeometry();
- const auto trianglesCount = aGeometry->getVertexCount() / 3;
if (IsMeshInvalid(aMesh))
return false;
- if (aMesh->getMaterialCount() < 2 || !aGeometry->getMaterials())
+ const auto& geomData = aMesh->getGeometryData();
+ for (int i = 0; i < geomData.getPartitionCount(); i++)
{
- // Fast path if mesh is using single material for all triangles
- if (ImportMesh(result, data, aMesh, errorMsg, 0, trianglesCount - 1))
- return true;
- }
- else
- {
- // Create mesh for each sequence of triangles that share the same material
- const auto materials = aGeometry->getMaterials();
- int32 rangeStart = 0;
- int32 rangeStartVal = materials[rangeStart];
- for (int32 triangleIndex = 1; triangleIndex < trianglesCount; triangleIndex++)
- {
- if (rangeStartVal != materials[triangleIndex])
- {
- if (ImportMesh(result, data, aMesh, errorMsg, rangeStart, triangleIndex - 1))
- return true;
+ const auto& partition = geomData.getPartition(i);
+ if (partition.polygon_count == 0)
+ continue;
- // Start a new range
- rangeStart = triangleIndex;
- rangeStartVal = materials[triangleIndex];
- }
- }
- if (ImportMesh(result, data, aMesh, errorMsg, rangeStart, trianglesCount - 1))
+ if (ImportMesh(result, data, aMesh, errorMsg, i))
return true;
}
return false;
@@ -962,35 +1109,35 @@ struct AnimInfo
struct Frame
{
- ofbx::Vec3 Translation;
- ofbx::Vec3 Rotation;
- ofbx::Vec3 Scaling;
+ ofbx::DVec3 Translation;
+ ofbx::DVec3 Rotation;
+ ofbx::DVec3 Scaling;
};
-void ExtractKeyframePosition(const ofbx::Object* bone, ofbx::Vec3& trans, const Frame& localFrame, Float3& keyframe)
+void ExtractKeyframePosition(const ofbx::Object* bone, ofbx::DVec3& trans, const Frame& localFrame, Float3& keyframe)
{
const Matrix frameTrans = ToMatrix(bone->evalLocal(trans, localFrame.Rotation, localFrame.Scaling));
keyframe = frameTrans.GetTranslation();
}
-void ExtractKeyframeRotation(const ofbx::Object* bone, ofbx::Vec3& trans, const Frame& localFrame, Quaternion& keyframe)
+void ExtractKeyframeRotation(const ofbx::Object* bone, ofbx::DVec3& trans, const Frame& localFrame, Quaternion& keyframe)
{
const Matrix frameTrans = ToMatrix(bone->evalLocal(localFrame.Translation, trans, { 1.0, 1.0, 1.0 }));
Quaternion::RotationMatrix(frameTrans, keyframe);
}
-void ExtractKeyframeScale(const ofbx::Object* bone, ofbx::Vec3& trans, const Frame& localFrame, Float3& keyframe)
+void ExtractKeyframeScale(const ofbx::Object* bone, ofbx::DVec3& trans, const Frame& localFrame, Float3& keyframe)
{
// Fix empty scale case
if (Math::IsZero(trans.x) && Math::IsZero(trans.y) && Math::IsZero(trans.z))
trans = { 1.0, 1.0, 1.0 };
- const Matrix frameTrans = ToMatrix(bone->evalLocal(localFrame.Translation, localFrame.Rotation, trans));
+ const Matrix frameTrans = ToMatrix(bone->evalLocal(localFrame.Translation, { 0.0, 0.0, 0.0 }, trans));
keyframe = frameTrans.GetScaleVector();
}
template<typename T>
-void ImportCurve(const ofbx::AnimationCurveNode* curveNode, LinearCurve<T>& curve, AnimInfo& info, void (*ExtractKeyframe)(const ofbx::Object*, ofbx::Vec3&, const Frame&, T&))
+void ImportCurve(const ofbx::AnimationCurveNode* curveNode, LinearCurve<T>& curve, AnimInfo& info, void (*ExtractKeyframe)(const ofbx::Object*, ofbx::DVec3&, const Frame&, T&))
{
if (curveNode == nullptr)
return;
@@ -1008,7 +1155,7 @@ void ImportCurve(const ofbx::AnimationCurveNode* curveNode, LinearCurve& curv
key.Time = (float)i;
- ofbx::Vec3 trans = curveNode->getNodeLocalTransform(t);
+ ofbx::DVec3 trans = curveNode->getNodeLocalTransform(t);
ExtractKeyframe(bone, trans, localFrame, key.Value);
}
}
@@ -1125,21 +1272,26 @@ bool ModelTool::ImportDataOpenFBX(const String& path, ModelData& data, Options&
errorMsg = TEXT("Cannot load file.");
return true;
}
- ofbx::u64 loadFlags = 0;
+ ofbx::LoadFlags loadFlags = ofbx::LoadFlags::NONE;
if (EnumHasAnyFlags(options.ImportTypes, ImportDataTypes::Geometry))
{
- loadFlags |= (ofbx::u64)ofbx::LoadFlags::TRIANGULATE;
if (!options.ImportBlendShapes)
- loadFlags |= (ofbx::u64)ofbx::LoadFlags::IGNORE_BLEND_SHAPES;
+ loadFlags |= ofbx::LoadFlags::IGNORE_BLEND_SHAPES;
}
else
{
- loadFlags |= (ofbx::u64)ofbx::LoadFlags::IGNORE_GEOMETRY | (ofbx::u64)ofbx::LoadFlags::IGNORE_BLEND_SHAPES;
+ loadFlags |= ofbx::LoadFlags::IGNORE_GEOMETRY | ofbx::LoadFlags::IGNORE_BLEND_SHAPES;
}
+ if (EnumHasNoneFlags(options.ImportTypes, ImportDataTypes::Materials))
+ loadFlags |= ofbx::LoadFlags::IGNORE_MATERIALS;
+ if (EnumHasNoneFlags(options.ImportTypes, ImportDataTypes::Textures))
+ loadFlags |= ofbx::LoadFlags::IGNORE_TEXTURES;
+ if (EnumHasNoneFlags(options.ImportTypes, ImportDataTypes::Animations))
+ loadFlags |= ofbx::LoadFlags::IGNORE_ANIMATIONS;
ofbx::IScene* scene;
{
PROFILE_CPU_NAMED("ofbx::load");
- scene = ofbx::load(fileData.Get(), fileData.Count(), loadFlags);
+ scene = ofbx::load(fileData.Get(), fileData.Count(), (ofbx::u16)loadFlags);
}
if (!scene)
{
diff --git a/Source/Engine/Tools/ModelTool/ModelTool.cpp b/Source/Engine/Tools/ModelTool/ModelTool.cpp
index bfbd9421b..296a21827 100644
--- a/Source/Engine/Tools/ModelTool/ModelTool.cpp
+++ b/Source/Engine/Tools/ModelTool/ModelTool.cpp
@@ -1024,31 +1024,31 @@ bool ModelTool::ImportModel(const String& path, ModelData& data, Options& option
mesh->BlendIndices.SetAll(indices);
mesh->BlendWeights.SetAll(weights);
}
-#if BUILD_DEBUG
else
{
auto& indices = mesh->BlendIndices;
for (int32 j = 0; j < indices.Count(); j++)
{
- const int32 min = indices[j].MinValue();
- const int32 max = indices[j].MaxValue();
+ const Int4 ij = indices.Get()[j];
+ const int32 min = ij.MinValue();
+ const int32 max = ij.MaxValue();
if (min < 0 || max >= data.Skeleton.Bones.Count())
{
LOG(Warning, "Imported mesh \'{0}\' has invalid blend indices. It may result in invalid rendering.", mesh->Name);
+ break;
}
}
-
auto& weights = mesh->BlendWeights;
for (int32 j = 0; j < weights.Count(); j++)
{
- const float sum = weights[j].SumValues();
+ const float sum = weights.Get()[j].SumValues();
if (Math::Abs(sum - 1.0f) > ZeroTolerance)
{
LOG(Warning, "Imported mesh \'{0}\' has invalid blend weights. It may result in invalid rendering.", mesh->Name);
+ break;
}
}
}
-#endif
}
}
if (EnumHasAnyFlags(options.ImportTypes, ImportDataTypes::Animations))
diff --git a/Source/ThirdParty/OpenFBX/libdeflate.cpp b/Source/ThirdParty/OpenFBX/libdeflate.cpp
new file mode 100644
index 000000000..2e2d5355d
--- /dev/null
+++ b/Source/ThirdParty/OpenFBX/libdeflate.cpp
@@ -0,0 +1,4193 @@
+// ofbx changes : removed unused code, single .h and .c
+/*
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a highly optimized DEFLATE decompressor. It is much faster than
+ * vanilla zlib, typically well over twice as fast, though results vary by CPU.
+ *
+ * Why this is faster than vanilla zlib:
+ *
+ * - Word accesses rather than byte accesses when reading input
+ * - Word accesses rather than byte accesses when copying matches
+ * - Faster Huffman decoding combined with various DEFLATE-specific tricks
+ * - Larger bitbuffer variable that doesn't need to be refilled as often
+ * - Other optimizations to remove unnecessary branches
+ * - Only full-buffer decompression is supported, so the code doesn't need to
+ * support stopping and resuming decompression.
+ * - On x86_64, a version of the decompression routine is compiled with BMI2
+ * instructions enabled and is used automatically at runtime when supported.
+ */
+
+/*
+ * lib_common.h - internal header included by all library code
+ */
+
+#ifndef LIB_LIB_COMMON_H
+#define LIB_LIB_COMMON_H
+
+#ifdef LIBDEFLATE_H
+ /*
+ * When building the library, LIBDEFLATEAPI needs to be defined properly before
+ * including libdeflate.h.
+ */
+# error "lib_common.h must always be included before libdeflate.h"
+#endif
+
+#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__))
+# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport)
+#elif defined(__GNUC__)
+# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default")))
+#else
+# define LIBDEFLATE_EXPORT_SYM
+#endif
+
+/*
+ * On i386, gcc assumes that the stack is 16-byte aligned at function entry.
+ * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi)
+ * only guarantee 4-byte alignment when calling functions. This is mainly an
+ * issue on Windows, but it has been seen on Linux too. Work around this ABI
+ * incompatibility by realigning the stack pointer when entering libdeflate.
+ * This prevents crashes in SSE/AVX code.
+ */
+#if defined(__GNUC__) && defined(__i386__)
+# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer))
+#else
+# define LIBDEFLATE_ALIGN_STACK
+#endif
+
+#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK
+
+/*
+ * common_defs.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef COMMON_DEFS_H
+#define COMMON_DEFS_H
+
+#include "libdeflate.h"
+
+#include <stdbool.h>
+#include <stddef.h> /* for size_t */
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h> /* for _BitScan*() and other intrinsics */
+# include <stdlib.h> /* for _byteswap_*() */
+ /* Disable MSVC warnings that are expected. */
+ /* /W2 */
+# pragma warning(disable : 4146) /* unary minus on unsigned type */
+ /* /W3 */
+# pragma warning(disable : 4018) /* signed/unsigned mismatch */
+# pragma warning(disable : 4244) /* possible loss of data */
+# pragma warning(disable : 4267) /* possible loss of precision */
+# pragma warning(disable : 4310) /* cast truncates constant value */
+ /* /W4 */
+# pragma warning(disable : 4100) /* unreferenced formal parameter */
+# pragma warning(disable : 4127) /* conditional expression is constant */
+# pragma warning(disable : 4189) /* local variable initialized but not referenced */
+# pragma warning(disable : 4232) /* nonstandard extension used */
+# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */
+# pragma warning(disable : 4295) /* array too small to include terminating null */
+#endif
+#ifndef FREESTANDING
+# include <string.h> /* for memcpy() */
+#endif
+
+/* ========================================================================== */
+/* Target architecture */
+/* ========================================================================== */
+
+/* If possible, define a compiler-independent ARCH_* macro. */
+#undef ARCH_X86_64
+#undef ARCH_X86_32
+#undef ARCH_ARM64
+#undef ARCH_ARM32
+#ifdef _MSC_VER
+# if defined(_M_X64)
+# define ARCH_X86_64
+# elif defined(_M_IX86)
+# define ARCH_X86_32
+# elif defined(_M_ARM64)
+# define ARCH_ARM64
+# elif defined(_M_ARM)
+# define ARCH_ARM32
+# endif
+#else
+# if defined(__x86_64__)
+# define ARCH_X86_64
+# elif defined(__i386__)
+# define ARCH_X86_32
+# elif defined(__aarch64__)
+# define ARCH_ARM64
+# elif defined(__arm__)
+# define ARCH_ARM32
+# endif
+#endif
+
+/* ========================================================================== */
+/* Type definitions */
+/* ========================================================================== */
+
+/* Fixed-width integer types */
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+/* ssize_t, if not available in <sys/types.h> */
+#ifdef _MSC_VER
+# ifdef _WIN64
+ typedef long long ssize_t;
+# else
+ typedef long ssize_t;
+# endif
+#endif
+
+/*
+ * Word type of the target architecture. Use 'size_t' instead of
+ * 'unsigned long' to account for platforms such as Windows that use 32-bit
+ * 'unsigned long' on 64-bit architectures.
+ */
+typedef size_t machine_word_t;
+
+/* Number of bytes in a word */
+#define WORDBYTES ((int)sizeof(machine_word_t))
+
+/* Number of bits in a word */
+#define WORDBITS (8 * WORDBYTES)
+
+/* ========================================================================== */
+/* Optional compiler features */
+/* ========================================================================== */
+
+/* Compiler version checks. Only use when absolutely necessary. */
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+# define GCC_PREREQ(major, minor) \
+ (__GNUC__ > (major) || \
+ (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+# define GCC_PREREQ(major, minor) 0
+#endif
+#ifdef __clang__
+# ifdef __apple_build_version__
+# define CLANG_PREREQ(major, minor, apple_version) \
+ (__apple_build_version__ >= (apple_version))
+# else
+# define CLANG_PREREQ(major, minor, apple_version) \
+ (__clang_major__ > (major) || \
+ (__clang_major__ == (major) && __clang_minor__ >= (minor)))
+# endif
+#else
+# define CLANG_PREREQ(major, minor, apple_version) 0
+#endif
+
+/*
+ * Macros to check for compiler support for attributes and builtins. clang
+ * implements these macros, but gcc doesn't, so generally any use of one of
+ * these macros must also be combined with a gcc version check.
+ */
+#ifndef __has_attribute
+# define __has_attribute(attribute) 0
+#endif
+#ifndef __has_builtin
+# define __has_builtin(builtin) 0
+#endif
+
+/* inline - suggest that a function be inlined */
+#ifdef _MSC_VER
+# define inline __inline
+#endif /* else assume 'inline' is usable as-is */
+
+/* forceinline - force a function to be inlined, if possible */
+#if defined(__GNUC__) || __has_attribute(always_inline)
+# define forceinline inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define forceinline __forceinline
+#else
+# define forceinline inline
+#endif
+
+/* MAYBE_UNUSED - mark a function or variable as maybe unused */
+#if defined(__GNUC__) || __has_attribute(unused)
+# define MAYBE_UNUSED __attribute__((unused))
+#else
+# define MAYBE_UNUSED
+#endif
+
+/*
+ * restrict - hint that writes only occur through the given pointer.
+ *
+ * Don't use MSVC's __restrict, since it has nonstandard behavior.
+ * Standard restrict is okay, if it is supported.
+ */
+#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L)
+# if defined(__GNUC__) || defined(__clang__)
+# define restrict __restrict__
+# else
+# define restrict
+# endif
+#endif /* else assume 'restrict' is usable as-is */
+
+/* likely(expr) - hint that an expression is usually true */
+#if defined(__GNUC__) || __has_builtin(__builtin_expect)
+# define likely(expr) __builtin_expect(!!(expr), 1)
+#else
+# define likely(expr) (expr)
+#endif
+
+/* unlikely(expr) - hint that an expression is usually false */
+#if defined(__GNUC__) || __has_builtin(__builtin_expect)
+# define unlikely(expr) __builtin_expect(!!(expr), 0)
+#else
+# define unlikely(expr) (expr)
+#endif
+
+/* prefetchr(addr) - prefetch into L1 cache for read */
+#undef prefetchr
+#if defined(__GNUC__) || __has_builtin(__builtin_prefetch)
+# define prefetchr(addr) __builtin_prefetch((addr), 0)
+#elif defined(_MSC_VER)
+# if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0)
+# elif defined(ARCH_ARM64)
+# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */)
+# elif defined(ARCH_ARM32)
+# define prefetchr(addr) __prefetch(addr)
+# endif
+#endif
+#ifndef prefetchr
+# define prefetchr(addr)
+#endif
+
+/* prefetchw(addr) - prefetch into L1 cache for write */
+#undef prefetchw
+#if defined(__GNUC__) || __has_builtin(__builtin_prefetch)
+# define prefetchw(addr) __builtin_prefetch((addr), 1)
+#elif defined(_MSC_VER)
+# if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# define prefetchw(addr) _m_prefetchw(addr)
+# elif defined(ARCH_ARM64)
+# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */)
+# elif defined(ARCH_ARM32)
+# define prefetchw(addr) __prefetchw(addr)
+# endif
+#endif
+#ifndef prefetchw
+# define prefetchw(addr)
+#endif
+
+/*
+ * _aligned_attribute(n) - declare that the annotated variable, or variables of
+ * the annotated type, must be aligned on n-byte boundaries.
+ */
+#undef _aligned_attribute
+#if defined(__GNUC__) || __has_attribute(aligned)
+# define _aligned_attribute(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+# define _aligned_attribute(n) __declspec(align(n))
+#endif
+
+/*
+ * _target_attribute(attrs) - override the compilation target for a function.
+ *
+ * This accepts one or more comma-separated suffixes to the -m prefix jointly
+ * forming the name of a machine-dependent option. On gcc-like compilers, this
+ * enables codegen for the given targets, including arbitrary compiler-generated
+ * code as well as the corresponding intrinsics. On other compilers this macro
+ * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway.
+ */
+#if GCC_PREREQ(4, 4) || __has_attribute(target)
+# define _target_attribute(attrs) __attribute__((target(attrs)))
+# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1
+#else
+# define _target_attribute(attrs)
+# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
+#endif
+
+/* ========================================================================== */
+/* Miscellaneous macros */
+/* ========================================================================== */
+
+#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0]))
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
+#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
+#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d)))
+
+/* ========================================================================== */
+/* Endianness handling */
+/* ========================================================================== */
+
+/*
+ * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big
+ * endian. When possible this is a compile-time macro that can be used in
+ * preprocessor conditionals. As a fallback, a generic method is used that
+ * can't be used in preprocessor conditionals but should still be optimized out.
+ */
+#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */
+# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#elif defined(_MSC_VER)
+# define CPU_IS_LITTLE_ENDIAN() true
+#else
+static forceinline bool CPU_IS_LITTLE_ENDIAN(void)
+{
+ union {
+ u32 w;
+ u8 b;
+ } u;
+
+ u.w = 1;
+ return u.b;
+}
+#endif
+
+/* bswap16(v) - swap the bytes of a 16-bit integer */
+static forceinline u16 bswap16(u16 v)
+{
+#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+ return __builtin_bswap16(v);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(v);
+#else
+ return (v << 8) | (v >> 8);
+#endif
+}
+
+/* bswap32(v) - swap the bytes of a 32-bit integer */
+static forceinline u32 bswap32(u32 v)
+{
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+ return __builtin_bswap32(v);
+#elif defined(_MSC_VER)
+ return _byteswap_ulong(v);
+#else
+ return ((v & 0x000000FF) << 24) |
+ ((v & 0x0000FF00) << 8) |
+ ((v & 0x00FF0000) >> 8) |
+ ((v & 0xFF000000) >> 24);
+#endif
+}
+
+/* bswap64(v) - swap the bytes of a 64-bit integer */
+static forceinline u64 bswap64(u64 v)
+{
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+ return __builtin_bswap64(v);
+#elif defined(_MSC_VER)
+ return _byteswap_uint64(v);
+#else
+ return ((v & 0x00000000000000FF) << 56) |
+ ((v & 0x000000000000FF00) << 40) |
+ ((v & 0x0000000000FF0000) << 24) |
+ ((v & 0x00000000FF000000) << 8) |
+ ((v & 0x000000FF00000000) >> 8) |
+ ((v & 0x0000FF0000000000) >> 24) |
+ ((v & 0x00FF000000000000) >> 40) |
+ ((v & 0xFF00000000000000) >> 56);
+#endif
+}
+
+#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v))
+#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v))
+#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v))
+#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap16(v) : (v))
+#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v))
+#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v))
+
+/* ========================================================================== */
+/* Unaligned memory accesses */
+/* ========================================================================== */
+
+/*
+ * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed
+ * efficiently on the target platform, otherwise 0.
+ */
+#if (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \
+ defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
+ /*
+ * For all compilation purposes, WebAssembly behaves like any other CPU
+ * instruction set. Even though WebAssembly engine might be running on
+ * top of different actual CPU architectures, the WebAssembly spec
+ * itself permits unaligned access and it will be fast on most of those
+ * platforms, and simulated at the engine level on others, so it's
+ * worth treating it as a CPU architecture with fast unaligned access.
+ */ defined(__wasm__))
+# define UNALIGNED_ACCESS_IS_FAST 1
+#elif defined(_MSC_VER)
+# define UNALIGNED_ACCESS_IS_FAST 1
+#else
+# define UNALIGNED_ACCESS_IS_FAST 0
+#endif
+
+/*
+ * Implementing unaligned memory accesses using memcpy() is portable, and it
+ * usually gets optimized appropriately by modern compilers. I.e., each
+ * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store
+ * instruction, not to an actual function call.
+ *
+ * We no longer use the "packed struct" approach to unaligned accesses, as that
+ * is nonstandard, has unclear semantics, and doesn't receive enough testing
+ * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
+ *
+ * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
+ * where memcpy() generates inefficient code
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer
+ * consider that one case important enough to maintain different code for.
+ * If you run into it, please just use a newer version of gcc (or use clang).
+ */
+
+#ifdef FREESTANDING
+# define MEMCOPY __builtin_memcpy
+#else
+# define MEMCOPY memcpy
+#endif
+
+/* Unaligned loads and stores without endianness conversion */
+
+#define DEFINE_UNALIGNED_TYPE(type) \
+static forceinline type \
+load_##type##_unaligned(const void *p) \
+{ \
+ type v; \
+ \
+ MEMCOPY(&v, p, sizeof(v)); \
+ return v; \
+} \
+ \
+static forceinline void \
+store_##type##_unaligned(type v, void *p) \
+{ \
+ MEMCOPY(p, &v, sizeof(v)); \
+}
+
+DEFINE_UNALIGNED_TYPE(u16)
+DEFINE_UNALIGNED_TYPE(u32)
+DEFINE_UNALIGNED_TYPE(u64)
+DEFINE_UNALIGNED_TYPE(machine_word_t)
+
+#undef MEMCOPY
+
+#define load_word_unaligned load_machine_word_t_unaligned
+#define store_word_unaligned store_machine_word_t_unaligned
+
+/* Unaligned loads with endianness conversion */
+
+static forceinline u16
+get_unaligned_le16(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le16_bswap(load_u16_unaligned(p));
+ else
+ return ((u16)p[1] << 8) | p[0];
+}
+
+static forceinline u16
+get_unaligned_be16(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return be16_bswap(load_u16_unaligned(p));
+ else
+ return ((u16)p[0] << 8) | p[1];
+}
+
+static forceinline u32
+get_unaligned_le32(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le32_bswap(load_u32_unaligned(p));
+ else
+ return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
+ ((u32)p[1] << 8) | p[0];
+}
+
+static forceinline u32
+get_unaligned_be32(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return be32_bswap(load_u32_unaligned(p));
+ else
+ return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
+ ((u32)p[2] << 8) | p[3];
+}
+
+static forceinline u64
+get_unaligned_le64(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le64_bswap(load_u64_unaligned(p));
+ else
+ return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
+ ((u64)p[5] << 40) | ((u64)p[4] << 32) |
+ ((u64)p[3] << 24) | ((u64)p[2] << 16) |
+ ((u64)p[1] << 8) | p[0];
+}
+
+static forceinline machine_word_t
+get_unaligned_leword(const u8 *p)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return get_unaligned_le32(p);
+ else
+ return get_unaligned_le64(p);
+}
+
+/* Unaligned stores with endianness conversion */
+
+static forceinline void
+put_unaligned_le16(u16 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u16_unaligned(le16_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ }
+}
+
+static forceinline void
+put_unaligned_be16(u16 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u16_unaligned(be16_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 8);
+ p[1] = (u8)(v >> 0);
+ }
+}
+
+static forceinline void
+put_unaligned_le32(u32 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u32_unaligned(le32_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ p[2] = (u8)(v >> 16);
+ p[3] = (u8)(v >> 24);
+ }
+}
+
+static forceinline void
+put_unaligned_be32(u32 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u32_unaligned(be32_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 24);
+ p[1] = (u8)(v >> 16);
+ p[2] = (u8)(v >> 8);
+ p[3] = (u8)(v >> 0);
+ }
+}
+
+static forceinline void
+put_unaligned_le64(u64 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u64_unaligned(le64_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ p[2] = (u8)(v >> 16);
+ p[3] = (u8)(v >> 24);
+ p[4] = (u8)(v >> 32);
+ p[5] = (u8)(v >> 40);
+ p[6] = (u8)(v >> 48);
+ p[7] = (u8)(v >> 56);
+ }
+}
+
+static forceinline void
+put_unaligned_leword(machine_word_t v, u8 *p)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ put_unaligned_le32(v, p);
+ else
+ put_unaligned_le64(v, p);
+}
+
+/* ========================================================================== */
+/* Bit manipulation functions */
+/* ========================================================================== */
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant end) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static forceinline unsigned
+bsr32(u32 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_clz)
+ return 31 - __builtin_clz(v);
+#elif defined(_MSC_VER)
+ unsigned long i;
+
+ _BitScanReverse(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ while ((v >>= 1) != 0)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsr64(u64 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_clzll)
+ return 63 - __builtin_clzll(v);
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanReverse64(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ while ((v >>= 1) != 0)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsrw(machine_word_t v)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return bsr32(v);
+ else
+ return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant end) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static forceinline unsigned
+bsf32(u32 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_ctz)
+ return __builtin_ctz(v);
+#elif defined(_MSC_VER)
+ unsigned long i;
+
+ _BitScanForward(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ for (; (v & 1) == 0; v >>= 1)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsf64(u64 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_ctzll)
+ return __builtin_ctzll(v);
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanForward64(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ for (; (v & 1) == 0; v >>= 1)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsfw(machine_word_t v)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return bsf32(v);
+ else
+ return bsf64(v);
+}
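+
+/*
+ * Illustrative example (compiled out): a common use of bsf32() is iterating
+ * over the set bits of a mask, clearing the lowest set bit on each step.
+ */
+#if 0
+static void
+example_for_each_set_bit(u32 mask, void (*cb)(unsigned bit_index))
+{
+ while (mask != 0) {
+ cb(bsf32(mask));
+ mask &= mask - 1; /* clear the lowest set bit */
+ }
+}
+#endif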
+
+/*
+ * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a
+ * fallback implementation; use '#ifdef rbit32' to check if this is available.
+ */
+#undef rbit32
+#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \
+ (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__)))
+static forceinline u32
+rbit32(u32 v)
+{
+ __asm__("rbit %0, %1" : "=r" (v) : "r" (v));
+ return v;
+}
+#define rbit32 rbit32
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64)
+static forceinline u32
+rbit32(u32 v)
+{
+ __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v));
+ return v;
+}
+#define rbit32 rbit32
+#endif
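+
+/*
+ * Illustrative example (compiled out): since rbit32() has no generic fallback,
+ * callers are expected to guard its use with '#ifdef rbit32' and provide an
+ * alternative code path, as sketched below with a standard portable bit
+ * reversal.
+ */
+#if 0
+static u32
+example_reverse_bits(u32 v)
+{
+#ifdef rbit32
+ return rbit32(v);
+#else
+ /* Swap adjacent bits, then bit pairs, nibbles, bytes, and halfwords. */
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ return (v >> 16) | (v << 16);
+#endif
+}
+#endif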
+
+#endif /* COMMON_DEFS_H */
+
+
+typedef void *(*malloc_func_t)(size_t);
+typedef void (*free_func_t)(void *);
+
+extern malloc_func_t libdeflate_default_malloc_func;
+extern free_func_t libdeflate_default_free_func;
+
+void *libdeflate_aligned_malloc(malloc_func_t malloc_func,
+ size_t alignment, size_t size);
+void libdeflate_aligned_free(free_func_t free_func, void *ptr);
+
+#ifdef FREESTANDING
+/*
+ * With -ffreestanding, <string.h> may be missing, and we must provide
+ * implementations of memset(), memcpy(), memmove(), and memcmp().
+ * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html
+ *
+ * Also, -ffreestanding disables interpreting calls to these functions as
+ * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call,
+ * not be optimized to a single load instruction. For performance reasons we
+ * don't want that. So, declare these functions as macros that expand to the
+ * corresponding built-ins. This approach is recommended in the gcc man page.
+ * We still need the actual function definitions in case gcc calls them.
+ */
+void *memset(void *s, int c, size_t n);
+#define memset(s, c, n) __builtin_memset((s), (c), (n))
+
+void *memcpy(void *dest, const void *src, size_t n);
+#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n))
+
+void *memmove(void *dest, const void *src, size_t n);
+#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n))
+
+int memcmp(const void *s1, const void *s2, size_t n);
+#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n))
+
+#undef LIBDEFLATE_ENABLE_ASSERTIONS
+#else
+#include <string.h>
+#endif
+
+/*
+ * Runtime assertion support. Don't enable this in production builds; it may
+ * hurt performance significantly.
+ */
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+void libdeflate_assertion_failed(const char *expr, const char *file, int line);
+#define ASSERT(expr) { if (unlikely(!(expr))) \
+ libdeflate_assertion_failed(#expr, __FILE__, __LINE__); }
+#else
+#define ASSERT(expr) (void)(expr)
+#endif
+
+#define CONCAT_IMPL(a, b) a##b
+#define CONCAT(a, b) CONCAT_IMPL(a, b)
+#define ADD_SUFFIX(name) CONCAT(name, SUFFIX)
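+
+/*
+ * For example (illustrative, names hypothetical): if a translation unit defines
+ * SUFFIX as _avx2, then ADD_SUFFIX(some_func) expands to some_func_avx2. The
+ * extra CONCAT_IMPL level ensures SUFFIX is macro-expanded before the tokens
+ * are pasted together.
+ */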
+
+#endif /* LIB_LIB_COMMON_H */
+
+/*
+ * deflate_constants.h - constants for the DEFLATE compression format
+ */
+
+#ifndef LIB_DEFLATE_CONSTANTS_H
+#define LIB_DEFLATE_CONSTANTS_H
+
+/* Valid block types */
+#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0
+#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1
+#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2
+
+/* Minimum and maximum supported match lengths (in bytes) */
+#define DEFLATE_MIN_MATCH_LEN 3
+#define DEFLATE_MAX_MATCH_LEN 258
+
+/* Maximum supported match offset (in bytes) */
+#define DEFLATE_MAX_MATCH_OFFSET 32768
+
+/* log2 of DEFLATE_MAX_MATCH_OFFSET */
+#define DEFLATE_WINDOW_ORDER 15
+
+/* Number of symbols in each Huffman code. Note: for the literal/length
+ * and offset codes, these are actually the maximum values; a given block
+ * might use fewer symbols. */
+#define DEFLATE_NUM_PRECODE_SYMS 19
+#define DEFLATE_NUM_LITLEN_SYMS 288
+#define DEFLATE_NUM_OFFSET_SYMS 32
+
+/* The maximum number of symbols across all codes */
+#define DEFLATE_MAX_NUM_SYMS 288
+
+/* Division of symbols in the literal/length code */
+#define DEFLATE_NUM_LITERALS 256
+#define DEFLATE_END_OF_BLOCK 256
+#define DEFLATE_FIRST_LEN_SYM 257
+
+/* Maximum codeword length, in bits, within each Huffman code */
+#define DEFLATE_MAX_PRE_CODEWORD_LEN 7
+#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15
+#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15
+
+/* The maximum codeword length across all codes */
+#define DEFLATE_MAX_CODEWORD_LEN 15
+
+/* Maximum possible overrun when decoding codeword lengths */
+#define DEFLATE_MAX_LENS_OVERRUN 137
+
+/*
+ * Maximum number of extra bits that may be required to represent a match
+ * length or offset.
+ */
+#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5
+#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13
+
+#endif /* LIB_DEFLATE_CONSTANTS_H */
+
+/*
+ * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c
+ *
+ * Copyright 2020 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_CPU_FEATURES_COMMON_H
+#define LIB_CPU_FEATURES_COMMON_H
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+ /* for strdup() and strtok_r() */
+# undef _ANSI_SOURCE
+# ifndef __APPLE__
+# undef _GNU_SOURCE
+# define _GNU_SOURCE
+# endif
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+#endif
+
+struct cpu_feature {
+ u32 bit;
+ const char *name;
+};
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+ const struct cpu_feature *feature_table,
+ size_t feature_table_length)
+{
+ char *env_value, *strbuf, *p, *saveptr = NULL;
+ size_t i;
+
+ env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES");
+ if (!env_value)
+ return;
+ strbuf = strdup(env_value);
+ if (!strbuf)
+ abort();
+ p = strtok_r(strbuf, ",", &saveptr);
+ while (p) {
+ for (i = 0; i < feature_table_length; i++) {
+ if (strcmp(p, feature_table[i].name) == 0) {
+ *features &= ~feature_table[i].bit;
+ break;
+ }
+ }
+ if (i == feature_table_length) {
+ fprintf(stderr,
+ "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n",
+ p);
+ abort();
+ }
+ p = strtok_r(NULL, ",", &saveptr);
+ }
+ free(strbuf);
+}
+#else /* TEST_SUPPORT__DO_NOT_USE */
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+ const struct cpu_feature *feature_table,
+ size_t feature_table_length)
+{
+}
+#endif /* !TEST_SUPPORT__DO_NOT_USE */
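+
+/*
+ * Illustrative usage sketch (not part of the library, compiled out): an
+ * arch-specific cpu_features.c would typically define a feature table and call
+ * disable_cpu_features_for_testing() on the detected bits before publishing
+ * them, so that e.g. LIBDEFLATE_DISABLE_CPU_FEATURES=feature_a clears the
+ * corresponding bit. The feature names and bit values below are hypothetical.
+ */
+#if 0
+static const struct cpu_feature example_feature_table[] = {
+ { 0x1, "feature_a" },
+ { 0x2, "feature_b" },
+};
+
+static void
+example_init_cpu_features(volatile u32 *features_out)
+{
+ u32 features = 0x1 | 0x2; /* pretend both features were detected */
+
+ disable_cpu_features_for_testing(&features, example_feature_table,
+ ARRAY_LEN(example_feature_table));
+ *features_out = features;
+}
+#endif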
+
+#endif /* LIB_CPU_FEATURES_COMMON_H */
+
+/*
+ * x86/cpu_features.h - feature detection for x86 CPUs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_CPU_FEATURES_H
+#define LIB_X86_CPU_FEATURES_H
+
+#define HAVE_DYNAMIC_X86_CPU_FEATURES 0
+
+#if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+
+#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)
+# undef HAVE_DYNAMIC_X86_CPU_FEATURES
+# define HAVE_DYNAMIC_X86_CPU_FEATURES 1
+#endif
+
+#define X86_CPU_FEATURE_SSE2 0x00000001
+#define X86_CPU_FEATURE_PCLMUL 0x00000002
+#define X86_CPU_FEATURE_AVX 0x00000004
+#define X86_CPU_FEATURE_AVX2 0x00000008
+#define X86_CPU_FEATURE_BMI2 0x00000010
+
+#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
+#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
+#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
+#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
+#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
+
+#if HAVE_DYNAMIC_X86_CPU_FEATURES
+#define X86_CPU_FEATURES_KNOWN 0x80000000
+extern volatile u32 libdeflate_x86_cpu_features;
+
+void libdeflate_init_x86_cpu_features(void);
+
+static inline u32 get_x86_cpu_features(void)
+{
+ if (libdeflate_x86_cpu_features == 0)
+ libdeflate_init_x86_cpu_features();
+ return libdeflate_x86_cpu_features;
+}
+#else /* HAVE_DYNAMIC_X86_CPU_FEATURES */
+static inline u32 get_x86_cpu_features(void) { return 0; }
+#endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */
+
+/*
+ * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not
+ * available in the main target couldn't be used in 'target' attribute
+ * functions. Unfortunately clang has no feature test macro for this, so we
+ * have to check its version.
+ */
+#if HAVE_DYNAMIC_X86_CPU_FEATURES && \
+ (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER))
+# define HAVE_TARGET_INTRINSICS 1
+#else
+# define HAVE_TARGET_INTRINSICS 0
+#endif
+
+/* SSE2 */
+#if defined(__SSE2__) || \
+ (defined(_MSC_VER) && \
+ (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)))
+# define HAVE_SSE2_NATIVE 1
+#else
+# define HAVE_SSE2_NATIVE 0
+#endif
+#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS)
+
+/* PCLMUL */
+#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+# define HAVE_PCLMUL_NATIVE 1
+#else
+# define HAVE_PCLMUL_NATIVE 0
+#endif
+#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
+ defined(_MSC_VER)))
+# define HAVE_PCLMUL_INTRIN 1
+#else
+# define HAVE_PCLMUL_INTRIN 0
+#endif
+
+/* AVX */
+#ifdef __AVX__
+# define HAVE_AVX_NATIVE 1
+#else
+# define HAVE_AVX_NATIVE 0
+#endif
+#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \
+ defined(_MSC_VER)))
+# define HAVE_AVX_INTRIN 1
+#else
+# define HAVE_AVX_INTRIN 0
+#endif
+
+/* AVX2 */
+#ifdef __AVX2__
+# define HAVE_AVX2_NATIVE 1
+#else
+# define HAVE_AVX2_NATIVE 0
+#endif
+#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \
+ defined(_MSC_VER)))
+# define HAVE_AVX2_INTRIN 1
+#else
+# define HAVE_AVX2_INTRIN 0
+#endif
+
+/* BMI2 */
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+# define HAVE_BMI2_NATIVE 1
+#else
+# define HAVE_BMI2_NATIVE 0
+#endif
+#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \
+ defined(_MSC_VER)))
+# define HAVE_BMI2_INTRIN 1
+#else
+# define HAVE_BMI2_INTRIN 0
+#endif
+
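+/*
+ * Illustrative dispatch sketch (not part of the library, compiled out): code
+ * with both a generic and an AVX2-specialized implementation might select
+ * between them at runtime as shown below. The function names are hypothetical.
+ */
+#if 0
+typedef void (*example_func_t)(void);
+static void example_impl_default(void) { /* generic code path */ }
+static void example_impl_avx2(void) { /* AVX2-accelerated code path */ }
+
+static example_func_t
+example_select_impl(void)
+{
+ if (HAVE_AVX2(get_x86_cpu_features()))
+ return example_impl_avx2;
+ return example_impl_default;
+}
+#endif
+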
+#endif /* ARCH_X86_32 || ARCH_X86_64 */
+
+#endif /* LIB_X86_CPU_FEATURES_H */
+
+
+/*
+ * If the expression passed to SAFETY_CHECK() evaluates to false, then the
+ * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
+ * compressed data is invalid.
+ *
+ * Theoretically, these checks could be disabled for specialized applications
+ * where all input to the decompressor will be trusted.
+ */
+#if 0
+# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
+# define SAFETY_CHECK(expr) (void)(expr)
+#else
+# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
+#endif
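+
+/*
+ * Illustrative usage (not part of the library, compiled out): SAFETY_CHECK()
+ * is intended for functions that return 'enum libdeflate_result', e.g.:
+ */
+#if 0
+static enum libdeflate_result
+example_validate_block_type(unsigned block_type)
+{
+ SAFETY_CHECK(block_type != 3); /* BTYPE 3 is reserved/invalid in DEFLATE */
+ return LIBDEFLATE_SUCCESS;
+}
+#endif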
+
+/*****************************************************************************
+ * Input bitstream *
+ *****************************************************************************/
+
+/*
+ * The state of the "input bitstream" consists of the following variables:
+ *
+ * - in_next: a pointer to the next unread byte in the input buffer
+ *
+ * - in_end: a pointer to just past the end of the input buffer
+ *
+ * - bitbuf: a word-sized variable containing bits that have been read from
+ * the input buffer or from the implicit appended zero bytes
+ *
+ * - bitsleft: the number of bits in 'bitbuf' available to be consumed.
+ * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
+ * contain more bits than this. However, only the bits counted
+ * by 'bitsleft' can actually be consumed; the rest can only be
+ * used for preloading.
+ *
+ * As a micro-optimization, we allow bits 8 and higher of
+ * 'bitsleft' to contain garbage. When consuming the bits
+ * associated with a decode table entry, this allows us to do
+ * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
+ * On some CPUs, this helps reduce instruction dependencies.
+ * This does have the disadvantage that 'bitsleft' sometimes
+ * needs to be cast to 'u8', such as when it's used as a shift
+ * amount in REFILL_BITS_BRANCHLESS(). But that one happens
+ * for free since most CPUs ignore high bits in shift amounts.
+ *
+ * - overread_count: the total number of implicit appended zero bytes that
+ * have been loaded into the bitbuffer, including any
+ * counted by 'bitsleft' and any already consumed
+ */
+
+/*
+ * The type for the bitbuffer variable ('bitbuf' described above). For best
+ * performance, this should have size equal to a machine word.
+ *
+ * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
+ * which they don't have to refill as often.
+ */
+typedef machine_word_t bitbuf_t;
+#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t))
+
+/* BITMASK(n) returns a bitmask of length 'n'. */
+#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1)
+
+/*
+ * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
+ * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if
+ * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
+ */
+#define MAX_BITSLEFT \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
+
+/*
+ * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
+ * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
+ * Since only whole bytes can be added to 'bitsleft', the worst case is
+ * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
+ */
+#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7)
+
+/*
+ * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
+ * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not*
+ * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
+ * byte-at-a-time refill method near the end of input.) This may exceed the
+ * number of consumable bits (counted by 'bitsleft'). Any bits not counted in
+ * 'bitsleft' can only be used for precomputation and cannot be consumed.
+ */
+#define FASTLOOP_PRELOADABLE_NBITS \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
+
+/*
+ * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
+ * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
+ * subsequent consumptions. This is 1 bit if the branchless refill method is
+ * being used, and 0 bits otherwise.
+ */
+#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
+
+/*
+ * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
+ * refilled, then it's always possible to consume 'n' bits from it. 'n' should
+ * be a compile-time constant, to enable compile-time evaluation.
+ */
+#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n))
+
+/*
+ * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
+ * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
+ * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The
+ * arguments should be compile-time constants to enable compile-time evaluation.
+ */
+#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \
+ (CONSUMABLE_NBITS >= (consume_nbits) && \
+ FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
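+
+/*
+ * Worked example (illustrative): on a typical 64-bit target where
+ * UNALIGNED_ACCESS_IS_FAST is 1, BITBUF_NBITS == 64, MAX_BITSLEFT == 63,
+ * CONSUMABLE_NBITS == 56, FASTLOOP_PRELOADABLE_NBITS == 64, and
+ * PRELOAD_SLACK == 1. So CAN_CONSUME(56) holds, and
+ * CAN_CONSUME_AND_THEN_PRELOAD(20, 28) holds because 20 <= 56 and
+ * 20 + 28 <= 64.
+ */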
+
+/*
+ * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
+ * reading the next word from the input buffer and updating 'in_next' and
+ * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
+ * This is much faster than reading a byte at a time, at least if the CPU is
+ * little endian and supports fast unaligned memory accesses.
+ *
+ * The simplest way of branchlessly updating 'bitsleft' would be:
+ *
+ * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
+ *
+ * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
+ * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update
+ * 'bitsleft' by just setting the bits above the low 3 bits:
+ *
+ * bitsleft |= MAX_BITSLEFT & ~7;
+ *
+ * That compiles down to a single instruction like 'or $0x38, %rbp'. Using
+ * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
+ * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
+ *
+ * The simplest way of branchlessly updating 'in_next' would be:
+ *
+ * in_next += (MAX_BITSLEFT - bitsleft) >> 3;
+ *
+ * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
+ * isn't really better:
+ *
+ * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
+ *
+ * An alternative which can be marginally better is the following:
+ *
+ * in_next += sizeof(bitbuf_t) - 1;
+ * in_next -= (bitsleft >> 3) & 0x7;
+ *
+ * It seems this would increase the number of CPU instructions from 3 (sub, shr,
+ * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield
+ * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
+ * more efficient because the length of the longest dependency chain decreases
+ * from 3 to 2. This alternative also has the advantage that it ignores the
+ * high bits in 'bitsleft', so it is compatible with the micro-optimization we
+ * use where we let the high bits of 'bitsleft' contain garbage.
+ */
+#define REFILL_BITS_BRANCHLESS() \
+do { \
+ bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \
+ in_next += sizeof(bitbuf_t) - 1; \
+ in_next -= (bitsleft >> 3) & 0x7; \
+ bitsleft |= MAX_BITSLEFT & ~7; \
+} while (0)
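+
+/*
+ * Worked example (illustrative, 64-bit word, MAX_BITSLEFT == 63): if
+ * bitsleft == 5 before the refill, then 'bitsleft |= MAX_BITSLEFT & ~7' is
+ * 'bitsleft |= 56', giving 61, i.e. exactly 7 whole bytes (56 bits) are counted
+ * as newly available. Correspondingly, 'in_next += 7' followed by
+ * 'in_next -= (5 >> 3) & 0x7' advances 'in_next' by a net 7 bytes, matching the
+ * 7 bytes of new data that now fit in 'bitbuf'.
+ */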
+
+/*
+ * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
+ * contains at least CONSUMABLE_NBITS consumable bits.
+ *
+ * This checks for the end of input, and it doesn't guarantee
+ * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
+ *
+ * If we would overread the input buffer, we just don't read anything, leaving
+ * the bits zeroed but marking them filled. This simplifies the decompressor
+ * because it removes the need to always be able to distinguish between real
+ * overreads and overreads caused only by the decompressor's own lookahead.
+ *
+ * We do still keep track of the number of bytes that have been overread, for
+ * two reasons. First, it allows us to determine the exact number of bytes that
+ * were consumed once the stream ends or an uncompressed block is reached.
+ * Second, it allows us to stop early if the overread amount gets so large (more
+ * than sizeof bitbuf) that it can only be caused by a real overread. (The
+ * second part is arguably unneeded, since libdeflate is buffer-based; given
+ * infinite zeroes, it will eventually either completely fill the output buffer
+ * or return an error. However, we do it to be slightly more friendly to the
+ * not-recommended use case of decompressing with an unknown output size.)
+ */
+#define REFILL_BITS() \
+do { \
+ if (UNALIGNED_ACCESS_IS_FAST && \
+ likely(in_end - in_next >= sizeof(bitbuf_t))) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ if (likely(in_next != in_end)) { \
+ bitbuf |= (bitbuf_t)*in_next++ << \
+ (u8)bitsleft; \
+ } else { \
+ overread_count++; \
+ SAFETY_CHECK(overread_count <= \
+ sizeof(bitbuf_t)); \
+ } \
+ bitsleft += 8; \
+ } \
+ } \
+} while (0)
+
+/*
+ * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
+ * end of the input. It can only be used in the fastloop.
+ */
+#define REFILL_BITS_IN_FASTLOOP() \
+do { \
+ STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \
+ FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \
+ if (UNALIGNED_ACCESS_IS_FAST) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \
+ bitsleft += 8; \
+ } \
+ } \
+} while (0)
+
+/*
+ * This is the worst-case maximum number of output bytes that are written to
+ * during each iteration of the fastloop. The worst case is 2 literals, then a
+ * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must
+ * be included for the intentional overrun in the match copy implementation.
+ */
+#define FASTLOOP_MAX_BYTES_WRITTEN \
+ (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
+
+/*
+ * This is the worst-case maximum number of input bytes that are read during
+ * each iteration of the fastloop. To get this value, we first compute the
+ * greatest number of bits that can be refilled during a loop iteration. The
+ * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
+ * be refilled later is no more than the maximum amount that can be consumed by
+ * 2 literals that don't need a subtable, then a match. We convert this value
+ * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
+ * can be advanced. Finally, we add sizeof(bitbuf_t) to account for
+ * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
+ */
+#define FASTLOOP_MAX_BYTES_READ \
+ (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \
+ LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \
+ sizeof(bitbuf_t))
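+
+/*
+ * Worked example (illustrative): on a 64-bit target, with the values defined
+ * later in this file (LITLEN_TABLEBITS == 11, LENGTH_MAXBITS == 20,
+ * OFFSET_MAXBITS == 28) and MAX_BITSLEFT == 63, FASTLOOP_MAX_BYTES_READ is
+ * DIV_ROUND_UP(63 + 22 + 20 + 28, 8) + 8 == 17 + 8 == 25 bytes, and
+ * FASTLOOP_MAX_BYTES_WRITTEN is 2 + 258 + (5 * 8) - 1 == 299 bytes.
+ */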
+
+/*****************************************************************************
+ * Huffman decoding *
+ *****************************************************************************/
+
+/*
+ * The fastest way to decode Huffman-encoded data is basically to use a decode
+ * table that maps the next TABLEBITS bits of data to their symbol. Each entry
+ * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A
+ * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
+ *
+ * Ideally, TABLEBITS and the maximum codeword length would be the same; some
+ * compression formats are designed with this goal in mind. Unfortunately, in
+ * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
+ * too large for a practical TABLEBITS. It's not *that* much larger, though, so
+ * the workaround is to use a single level of subtables. In the main table,
+ * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
+ * to the appropriate subtable along with the number of bits it is indexed with.
+ *
+ * The most efficient way to allocate subtables is to allocate them dynamically
+ * after the main table. The worst-case number of table entries needed,
+ * including subtables, is precomputable; see the ENOUGH constants below.
+ *
+ * A useful optimization is to store the codeword lengths in the decode table so
+ * that they don't have to be looked up by indexing a separate table that maps
+ * symbols to their codeword lengths. We basically do this; however, for the
+ * litlen and offset codes we also implement some DEFLATE-specific optimizations
+ * that build in the consideration of the "extra bits" and the
+ * literal/length/end-of-block division. For the exact decode table entry
+ * format we use, see the definitions of the *_decode_results[] arrays below.
+ */
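+
+/*
+ * Worked example (illustrative): with TABLEBITS == 3, a symbol whose codeword
+ * is the single bit '0' gets 2**(3-1) == 4 entries. Because the table is
+ * indexed with bit-reversed codewords (first codeword bit in the low-order
+ * index bit), those entries are at indices 0, 2, 4, and 6, i.e. spaced
+ * 2**1 == 2 apart rather than being consecutive.
+ */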
+
+
+/*
+ * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
+ * along with their corresponding ENOUGH values.
+ *
+ * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
+ * precode codeword length. This avoids ever needing subtables.
+ *
+ * For the litlen and offset codes, we cannot realistically avoid ever needing
+ * subtables, since litlen and offset codewords can be up to 15 bits. A higher
+ * TABLEBITS reduces the number of lookups that need a subtable, which increases
+ * performance; however, it increases memory usage and makes building the table
+ * take longer, which decreases performance. We choose values that work well in
+ * practice, making subtables rarely needed without making the tables too large.
+ *
+ * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
+ * considerations, 9 would fit the trade-off curve better. However, there is a
+ * performance benefit to using exactly 8 bits when it is a compile-time
+ * constant, as many CPUs can take the low byte more easily than the low 9 bits.
+ *
+ * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
+ * builds a table, it caps the actual table_bits to the longest codeword. This
+ * makes sense in theory, as there's no need for the table to be any larger than
+ * needed to support the longest codeword. However, having the table bits be a
+ * compile-time constant is beneficial to the performance of the decode loop, so
+ * there is a trade-off. libdeflate currently uses the dynamic table_bits
+ * strategy for the litlen table only, due to its larger maximum size.
+ * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
+ * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
+ *
+ * Each TABLEBITS value has a corresponding ENOUGH value that gives the
+ * worst-case maximum number of decode table entries, including the main table
+ * and all subtables. The ENOUGH value depends on three parameters:
+ *
+ * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ * (2) the maximum number of main table bits (*_TABLEBITS)
+ * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH values were computed using the utility program 'enough' from zlib.
+ */
+#define PRECODE_TABLEBITS 7
+#define PRECODE_ENOUGH 128 /* enough 19 7 7 */
+#define LITLEN_TABLEBITS 11
+#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */
+#define OFFSET_TABLEBITS 8
+#define OFFSET_ENOUGH 402 /* enough 32 8 15 */
+
+/*
+ * make_decode_table_entry() creates a decode table entry for the given symbol
+ * by combining the static part 'decode_results[sym]' with the dynamic part
+ * 'len', which is the remaining codeword length (the codeword length for main
+ * table entries, or the codeword length minus TABLEBITS for subtable entries).
+ *
+ * In all cases, we add 'len' to each of the two low-order bytes to create the
+ * appropriately-formatted decode table entry. See the definitions of the
+ * *_decode_results[] arrays below, where the entry format is described.
+ */
+static forceinline u32
+make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
+{
+ return decode_results[sym] + (len << 8) + len;
+}
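+
+/*
+ * Worked example (illustrative, using the litlen entry format described below):
+ * for the literal symbol 65 ('A') placed in the main litlen table with a 9-bit
+ * codeword, the entry is litlen_decode_results[65] + (9 << 8) + 9, i.e.
+ * HUFFDEC_LITERAL set, the literal value 65 in bits 23-16, the remaining
+ * codeword length 9 in bits 11-8, and 9 in the low byte (literals have no
+ * extra bits).
+ */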
+
+/*
+ * Here is the format of our precode decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Bit 20-16: presym
+ * Bit 10-8: codeword length [not used]
+ * Bit 2-0: codeword length
+ *
+ * The precode decode table never has subtables, since we use
+ * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
+ *
+ * precode_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
+ */
+static const u32 precode_decode_results[] = {
+#define ENTRY(presym) ((u32)presym << 16)
+ ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+ ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+ ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+ ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+ ENTRY(16) , ENTRY(17) , ENTRY(18) ,
+#undef ENTRY
+};
+
+/* Litlen and offset decode table entry flags */
+
+/* Indicates a literal entry in the litlen decode table */
+#define HUFFDEC_LITERAL 0x80000000
+
+/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
+#define HUFFDEC_EXCEPTIONAL 0x00008000
+
+/* Indicates a subtable pointer entry in the litlen or offset decode table */
+#define HUFFDEC_SUBTABLE_POINTER 0x00004000
+
+/* Indicates an end-of-block entry in the litlen decode table */
+#define HUFFDEC_END_OF_BLOCK 0x00002000
+
+/* Maximum number of bits that can be consumed by decoding a match length */
+#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS)
+#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS)
+
+/*
+ * Here is the format of our litlen decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Literals:
+ * Bit 31: 1 (HUFFDEC_LITERAL)
+ * Bit 23-16: literal value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Lengths:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 24-16: length base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * End of block:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 1 (HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Subtable pointer:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 30-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * This format has several desirable properties:
+ *
+ * - The codeword length, length slot base, and number of extra length bits
+ * are all built in. This eliminates the need to separately look up this
+ * information by indexing separate arrays by symbol or length slot.
+ *
+ * - The HUFFDEC_* flags enable easily distinguishing between the different
+ * types of entries. The HUFFDEC_LITERAL flag enables a fast path for
+ * literals; the high bit is used for this, as some CPUs can test the
+ * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag
+ * makes it possible to detect the two unlikely cases (subtable pointer
+ * and end of block) in a single bit flag test.
+ *
+ * - The low byte is the number of bits that need to be removed from the
+ * bitstream; this makes this value easily accessible, and it enables the
+ * micro-optimization of doing 'bitsleft -= entry' instead of
+ * 'bitsleft -= (u8)entry'. It also includes the number of extra bits,
+ * so they don't need to be removed separately.
+ *
+ * - The flags in bits 15-13 are arranged to be 0 when the
+ * "remaining codeword length" in bits 11-8 is needed, making this value
+ * fairly easily accessible as well via a shift and downcast.
+ *
+ * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
+ * needed, making it possible to extract this value with '& 0x3F' rather
+ * than '& 0xF'. This value is only used as a shift amount, so this can
+ * save an 'and' instruction as the masking by 0x3F happens implicitly.
+ *
+ * litlen_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
+ */
+static const u32 litlen_decode_results[] = {
+
+ /* Literals */
+#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16))
+ ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+ ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+ ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+ ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+ ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) ,
+ ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) ,
+ ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) ,
+ ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) ,
+ ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) ,
+ ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) ,
+ ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) ,
+ ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) ,
+ ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) ,
+ ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) ,
+ ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) ,
+ ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) ,
+ ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) ,
+ ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) ,
+ ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) ,
+ ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) ,
+ ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) ,
+ ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) ,
+ ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) ,
+ ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) ,
+ ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) ,
+ ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
+ ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
+ ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
+ ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
+ ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
+ ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
+ ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
+ ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
+ ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
+ ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
+ ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
+ ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
+ ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
+ ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
+ ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
+ ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
+ ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
+ ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
+ ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
+ ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
+ ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
+ ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
+ ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
+ ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
+ ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
+ ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
+ ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
+ ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
+ ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
+ ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
+ ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
+ ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
+ ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
+ ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
+ ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
+ ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
+ ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
+ ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
+ ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
+#undef ENTRY
+
+ /* End of block */
+ HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
+
+ /* Lengths */
+#define ENTRY(length_base, num_extra_bits) \
+ (((u32)(length_base) << 16) | (num_extra_bits))
+ ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0),
+ ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0),
+ ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
+ ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
+ ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
+ ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
+ ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
+ ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
+#undef ENTRY
+};
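+
+/*
+ * Worked example (illustrative): litlen symbol 269 maps to ENTRY(19, 2) above,
+ * i.e. a length base of 19 with 2 extra bits, so it decodes to a match length
+ * of 19-22 depending on the 2 extra bits read from the bitstream.
+ */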
+
+/* Maximum number of bits that can be consumed by decoding a match offset */
+#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)
+#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)
+
+/*
+ * Here is the format of our offset decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Offsets:
+ * Bit 31-16: offset base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * Subtable pointer:
+ * Bit 31-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * These work the same way as the length entries and subtable pointer entries in
+ * the litlen decode table; see litlen_decode_results[] above.
+ */
+static const u32 offset_decode_results[] = {
+#define ENTRY(offset_base, num_extra_bits) \
+ (((u32)(offset_base) << 16) | (num_extra_bits))
+ ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) ,
+ ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) ,
+ ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) ,
+ ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) ,
+ ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) ,
+ ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) ,
+ ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) ,
+ ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
+#undef ENTRY
+};
+
+/*
+ * The main DEFLATE decompressor structure. Since libdeflate only supports
+ * full-buffer decompression, this structure doesn't store the entire
+ * decompression state, most of which is in stack variables. Instead, this
+ * struct just contains the decode tables and some temporary arrays used for
+ * building them, as these are too large to comfortably allocate on the stack.
+ *
+ * Storing the decode tables in the decompressor struct also allows the decode
+ * tables for the static codes to be reused whenever two static Huffman blocks
+ * are decoded without an intervening dynamic block, even across streams.
+ */
+struct libdeflate_decompressor {
+
+ /*
+ * The arrays aren't all needed at the same time. 'precode_lens' and
+ * 'precode_decode_table' are unneeded after 'lens' has been filled.
+ * Furthermore, 'lens' need not be retained after building the litlen
+ * and offset decode tables. In fact, 'lens' can be in union with
+ * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+ * and is built first.
+ */
+
+ union {
+ u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+ struct {
+ u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+ DEFLATE_NUM_OFFSET_SYMS +
+ DEFLATE_MAX_LENS_OVERRUN];
+
+ u32 precode_decode_table[PRECODE_ENOUGH];
+ } l;
+
+ u32 litlen_decode_table[LITLEN_ENOUGH];
+ } u;
+
+ u32 offset_decode_table[OFFSET_ENOUGH];
+
+ /* used only during build_decode_table() */
+ u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+ bool static_codes_loaded;
+ unsigned litlen_tablebits;
+
+ /* The free() function for this struct, chosen at allocation time */
+ free_func_t free_func;
+};
+
+/*
+ * Build a table for fast decoding of symbols from a Huffman code. As input,
+ * this function takes the codeword length of each symbol which may be used in
+ * the code. As output, it produces a decode table for the canonical Huffman
+ * code described by the codeword lengths. The decode table is built with the
+ * assumption that it will be indexed with "bit-reversed" codewords, where the
+ * low-order bit is the first bit of the codeword. This format is used for all
+ * Huffman codes in DEFLATE.
+ *
+ * @decode_table
+ * The array in which the decode table will be generated. This array must
+ * have sufficient length; see the definition of the ENOUGH numbers.
+ * @lens
+ * An array which provides, for each symbol, the length of the
+ * corresponding codeword in bits, or 0 if the symbol is unused. This may
+ * alias @decode_table, since nothing is written to @decode_table until all
+ * @lens have been consumed. All codeword lengths are assumed to be <=
+ * @max_codeword_len but are otherwise considered untrusted. If they do
+ * not form a valid Huffman code, then the decode table is not built and
+ * %false is returned.
+ * @num_syms
+ * The number of symbols in the code, including all unused symbols.
+ * @decode_results
+ * An array which gives the incomplete decode result for each symbol. The
+ * needed values in this array will be combined with codeword lengths to
+ * make the final decode table entries using make_decode_table_entry().
+ * @table_bits
+ * The log base-2 of the number of main table entries to use.
+ * If @table_bits_ret != NULL, then @table_bits is treated as a maximum
+ * value and it will be decreased if a smaller table would be sufficient.
+ * @max_codeword_len
+ * The maximum allowed codeword length for this Huffman code.
+ * Must be <= DEFLATE_MAX_CODEWORD_LEN.
+ * @sorted_syms
+ * A temporary array of length @num_syms.
+ * @table_bits_ret
+ * If non-NULL, then the dynamic table_bits is enabled, and the actual
+ * table_bits value will be returned here.
+ *
+ * Returns %true if successful; %false if the codeword lengths do not form a
+ * valid Huffman code.
+ */
+static bool
+build_decode_table(u32 decode_table[],
+ const u8 lens[],
+ const unsigned num_syms,
+ const u32 decode_results[],
+ unsigned table_bits,
+ unsigned max_codeword_len,
+ u16 *sorted_syms,
+ unsigned *table_bits_ret)
+{
+ unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+ unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
+ unsigned sym; /* current symbol */
+ unsigned codeword; /* current codeword, bit-reversed */
+ unsigned len; /* current codeword length in bits */
+ unsigned count; /* num codewords remaining with this length */
+ u32 codespace_used; /* codespace used out of '2^max_codeword_len' */
+ unsigned cur_table_end; /* end index of current table */
+ unsigned subtable_prefix; /* codeword prefix of current subtable */
+ unsigned subtable_start; /* start index of current subtable */
+ unsigned subtable_bits; /* log2 of current subtable length */
+
+ /* Count how many codewords have each length, including 0. */
+ for (len = 0; len <= max_codeword_len; len++)
+ len_counts[len] = 0;
+ for (sym = 0; sym < num_syms; sym++)
+ len_counts[lens[sym]]++;
+
+ /*
+ * Determine the actual maximum codeword length that was used, and
+ * decrease table_bits to it if allowed.
+ */
+ while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
+ max_codeword_len--;
+ if (table_bits_ret != NULL) {
+ table_bits = MIN(table_bits, max_codeword_len);
+ *table_bits_ret = table_bits;
+ }
+
+ /*
+ * Sort the symbols primarily by increasing codeword length and
+ * secondarily by increasing symbol value; or equivalently by their
+ * codewords in lexicographic order, since a canonical code is assumed.
+ *
+ * For efficiency, also compute 'codespace_used' in the same pass over
+ * 'len_counts[]' used to build 'offsets[]' for sorting.
+ */
+
+ /* Ensure that 'codespace_used' cannot overflow. */
+ STATIC_ASSERT(sizeof(codespace_used) == 4);
+ STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
+ DEFLATE_MAX_NUM_SYMS);
+
+ offsets[0] = 0;
+ offsets[1] = len_counts[0];
+ codespace_used = 0;
+ for (len = 1; len < max_codeword_len; len++) {
+ offsets[len + 1] = offsets[len] + len_counts[len];
+ codespace_used = (codespace_used << 1) + len_counts[len];
+ }
+ codespace_used = (codespace_used << 1) + len_counts[len];
+
+ for (sym = 0; sym < num_syms; sym++)
+ sorted_syms[offsets[lens[sym]]++] = sym;
+
+ sorted_syms += offsets[0]; /* Skip unused symbols */
+
+ /* lens[] is done being used, so we can write to decode_table[] now. */
+
+ /*
+ * Check whether the lengths form a complete code (exactly fills the
+ * codespace), an incomplete code (doesn't fill the codespace), or an
+ * overfull code (overflows the codespace). A codeword of length 'n'
+ * uses proportion '1/(2^n)' of the codespace. An overfull code is
+ * nonsensical, so is considered invalid. An incomplete code is
+ * considered valid only in two specific cases; see below.
+ */
+
+ /* overfull code? */
+ if (unlikely(codespace_used > (1U << max_codeword_len)))
+ return false;
+
+ /* incomplete code? */
+ if (unlikely(codespace_used < (1U << max_codeword_len))) {
+ u32 entry;
+ unsigned i;
+
+ if (codespace_used == 0) {
+ /*
+ * An empty code is allowed. This can happen for the
+ * offset code in DEFLATE, since a dynamic Huffman block
+ * need not contain any matches.
+ */
+
+ /* sym=0, len=1 (arbitrary) */
+ entry = make_decode_table_entry(decode_results, 0, 1);
+ } else {
+ /*
+ * Allow codes with a single used symbol, with codeword
+ * length 1. The DEFLATE RFC is unclear regarding this
+ * case. What zlib's decompressor does is permit this
+ * for the litlen and offset codes and assume the
+ * codeword is '0' rather than '1'. We do the same
+ * except we allow this for precodes too, since there's
+ * no convincing reason to treat the codes differently.
+ * We also assign both codewords '0' and '1' to the
+ * symbol to avoid having to handle '1' specially.
+ */
+ if (codespace_used != (1U << (max_codeword_len - 1)) ||
+ len_counts[1] != 1)
+ return false;
+ entry = make_decode_table_entry(decode_results,
+ *sorted_syms, 1);
+ }
+ /*
+ * Note: the decode table still must be fully initialized, in
+ * case the stream is malformed and contains bits from the part
+ * of the codespace the incomplete code doesn't use.
+ */
+ for (i = 0; i < (1U << table_bits); i++)
+ decode_table[i] = entry;
+ return true;
+ }
+
+ /*
+ * The lengths form a complete code. Now, enumerate the codewords in
+ * lexicographic order and fill the decode table entries for each one.
+ *
+ * First, process all codewords with len <= table_bits. Each one gets
+ * '2^(table_bits-len)' direct entries in the table.
+ *
+ * Since DEFLATE uses bit-reversed codewords, these entries aren't
+ * consecutive but rather are spaced '2^len' entries apart. This makes
+ * filling them naively somewhat awkward and inefficient, since strided
+ * stores are less cache-friendly and preclude the use of word or
+ * vector-at-a-time stores to fill multiple entries per instruction.
+ *
+ * To optimize this, we incrementally double the table size. When
+ * processing codewords with length 'len', the table is treated as
+ * having only '2^len' entries, so each codeword uses just one entry.
+ * Then, each time 'len' is incremented, the table size is doubled and
+ * the first half is copied to the second half. This significantly
+ * improves performance over naively doing strided stores.
+ *
+ * Note that some entries copied for each table doubling may not have
+ * been initialized yet, but it doesn't matter since they're guaranteed
+ * to be initialized later (because the Huffman code is complete).
+ */
+ codeword = 0;
+ len = 1;
+ while ((count = len_counts[len]) == 0)
+ len++;
+ cur_table_end = 1U << len;
+ while (len <= table_bits) {
+ /* Process all 'count' codewords with length 'len' bits. */
+ do {
+ unsigned bit;
+
+ /* Fill the first entry for the current codeword. */
+ decode_table[codeword] =
+ make_decode_table_entry(decode_results,
+ *sorted_syms++, len);
+
+ if (codeword == cur_table_end - 1) {
+ /* Last codeword (all 1's) */
+ for (; len < table_bits; len++) {
+ memcpy(&decode_table[cur_table_end],
+ decode_table,
+ cur_table_end *
+ sizeof(decode_table[0]));
+ cur_table_end <<= 1;
+ }
+ return true;
+ }
+ /*
+ * To advance to the lexicographically next codeword in
+ * the canonical code, the codeword must be incremented,
+ * then 0's must be appended to the codeword as needed
+ * to match the next codeword's length.
+ *
+ * Since the codeword is bit-reversed, appending 0's is
+ * a no-op. However, incrementing it is nontrivial. To
+ * do so efficiently, use the 'bsr' instruction to find
+ * the last (highest order) 0 bit in the codeword, set
+ * it, and clear any later (higher order) 1 bits. But
+ * 'bsr' actually finds the highest order 1 bit, so to
+ * use it first flip all bits in the codeword by XOR'ing
+ * it with (1U << len) - 1 == cur_table_end - 1.
+ */
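+ /*
+ * Worked example: with len == 3 (so cur_table_end - 1 == 0b111), the
+ * bit-reversed codeword 0b110 (canonical codeword 011) advances to
+ * 0b001 (canonical codeword 100): 110 ^ 111 == 001, bsr32() returns
+ * 0, so 'bit' == 1, and (110 & 0) | 1 == 001.
+ */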
+ bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
+ codeword &= bit - 1;
+ codeword |= bit;
+ } while (--count);
+
+ /* Advance to the next codeword length. */
+ do {
+ if (++len <= table_bits) {
+ memcpy(&decode_table[cur_table_end],
+ decode_table,
+ cur_table_end * sizeof(decode_table[0]));
+ cur_table_end <<= 1;
+ }
+ } while ((count = len_counts[len]) == 0);
+ }
+
+ /* Process codewords with len > table_bits. These require subtables. */
+ cur_table_end = 1U << table_bits;
+ subtable_prefix = -1;
+ subtable_start = 0;
+ for (;;) {
+ u32 entry;
+ unsigned i;
+ unsigned stride;
+ unsigned bit;
+
+ /*
+ * Start a new subtable if the first 'table_bits' bits of the
+ * codeword don't match the prefix of the current subtable.
+ */
+ if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
+ subtable_prefix = (codeword & ((1U << table_bits) - 1));
+ subtable_start = cur_table_end;
+ /*
+ * Calculate the subtable length. If the codeword has
+ * length 'table_bits + n', then the subtable needs
+ * '2^n' entries. But it may need more; if fewer than
+ * '2^n' codewords of length 'table_bits + n' remain,
+ * then the length will need to be incremented to bring
+ * in longer codewords until the subtable can be
+ * completely filled. Note that because the Huffman
+ * code is complete, it will always be possible to fill
+ * the subtable eventually.
+ */
+ subtable_bits = len - table_bits;
+ codespace_used = count;
+ while (codespace_used < (1U << subtable_bits)) {
+ subtable_bits++;
+ codespace_used = (codespace_used << 1) +
+ len_counts[table_bits + subtable_bits];
+ }
+ cur_table_end = subtable_start + (1U << subtable_bits);
+
+ /*
+ * Create the entry that points from the main table to
+ * the subtable.
+ */
+ decode_table[subtable_prefix] =
+ ((u32)subtable_start << 16) |
+ HUFFDEC_EXCEPTIONAL |
+ HUFFDEC_SUBTABLE_POINTER |
+ (subtable_bits << 8) | table_bits;
+ }
+
+ /* Fill the subtable entries for the current codeword. */
+ entry = make_decode_table_entry(decode_results, *sorted_syms++,
+ len - table_bits);
+ i = subtable_start + (codeword >> table_bits);
+ stride = 1U << (len - table_bits);
+ do {
+ decode_table[i] = entry;
+ i += stride;
+ } while (i < cur_table_end);
+
+ /* Advance to the next codeword. */
+ if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
+ return true;
+ bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
+ codeword &= bit - 1;
+ codeword |= bit;
+ count--;
+ while (count == 0)
+ count = len_counts[++len];
+ }
+}
+
+/* Build the decode table for the precode. */
+static bool
+build_precode_decode_table(struct libdeflate_decompressor *d)
+{
+ /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+ STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+
+ STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
+ DEFLATE_NUM_PRECODE_SYMS);
+
+ return build_decode_table(d->u.l.precode_decode_table,
+ d->u.precode_lens,
+ DEFLATE_NUM_PRECODE_SYMS,
+ precode_decode_results,
+ PRECODE_TABLEBITS,
+ DEFLATE_MAX_PRE_CODEWORD_LEN,
+ d->sorted_syms,
+ NULL);
+}
+
+/* Build the decode table for the literal/length code. */
+static bool
+build_litlen_decode_table(struct libdeflate_decompressor *d,
+ unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+ /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+ STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
+
+ STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
+ DEFLATE_NUM_LITLEN_SYMS);
+
+ return build_decode_table(d->u.litlen_decode_table,
+ d->u.l.lens,
+ num_litlen_syms,
+ litlen_decode_results,
+ LITLEN_TABLEBITS,
+ DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ d->sorted_syms,
+ &d->litlen_tablebits);
+}
+
+/* Build the decode table for the offset code. */
+static bool
+build_offset_decode_table(struct libdeflate_decompressor *d,
+ unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+ /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+ STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+
+ STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
+ DEFLATE_NUM_OFFSET_SYMS);
+
+ return build_decode_table(d->offset_decode_table,
+ d->u.l.lens + num_litlen_syms,
+ num_offset_syms,
+ offset_decode_results,
+ OFFSET_TABLEBITS,
+ DEFLATE_MAX_OFFSET_CODEWORD_LEN,
+ d->sorted_syms,
+ NULL);
+}
+
+/*****************************************************************************
+ * Main decompression routine
+ *****************************************************************************/
+
+typedef enum libdeflate_result (*decompress_func_t)
+ (struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+#define FUNCNAME deflate_decompress_default
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
+/*
+ * decompress_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This is the actual DEFLATE decompression routine, lifted out of
+ * deflate_decompress.c so that it can be compiled multiple times with different
+ * target instruction sets.
+ */
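+
+/*
+ * Illustrative note (not part of the original comment): in the un-amalgamated
+ * library, a target-specific source file re-includes this template with a
+ * different FUNCNAME and, typically, an ATTRIBUTES value naming a 'target'
+ * function attribute, roughly like (hypothetical):
+ *
+ * #define FUNCNAME deflate_decompress_bmi2
+ * #define ATTRIBUTES __attribute__((target("bmi2")))
+ * #include "decompress_template.h"
+ */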
+
+#ifndef ATTRIBUTES
+# define ATTRIBUTES
+#endif
+#ifndef EXTRACT_VARBITS
+# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count))
+#endif
+#ifndef EXTRACT_VARBITS8
+# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
+#endif
+
+static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+FUNCNAME(struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+ u8 *out_next = (u8*)out;
+ u8 * const out_end = out_next + out_nbytes_avail;
+ u8 * const out_fastloop_end =
+ out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
+
+ /* Input bitstream state; see deflate_decompress.c for documentation */
+ const u8 *in_next = (u8*)in;
+ const u8 * const in_end = in_next + in_nbytes;
+ const u8 * const in_fastloop_end =
+ in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
+ bitbuf_t bitbuf = 0;
+ bitbuf_t saved_bitbuf;
+ u32 bitsleft = 0;
+ size_t overread_count = 0;
+
+ bool is_final_block;
+ unsigned block_type;
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ bitbuf_t litlen_tablemask;
+ u32 entry;
+
+next_block:
+ /* Starting to read the next block */
+ ;
+
+ STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
+ REFILL_BITS();
+
+ /* BFINAL: 1 bit */
+ is_final_block = bitbuf & BITMASK(1);
+
+ /* BTYPE: 2 bits */
+ block_type = (bitbuf >> 1) & BITMASK(2);
+
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
+
+ /* Dynamic Huffman block */
+
+ /* The order in which precode lengths are stored */
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+ };
+
+ unsigned num_explicit_precode_lens;
+ unsigned i;
+
+ /* Read the codeword length counts. */
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
+ num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
+ num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
+ num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
+
+ d->static_codes_loaded = false;
+
+ /*
+ * Read the precode codeword lengths.
+ *
+ * A 64-bit bitbuffer is just one bit too small to hold the
+ * maximum number of precode lens, so to minimize branches we
+ * merge one len with the previous fields.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
+ if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+ d->u.precode_lens[deflate_precode_lens_permutation[0]] =
+ (bitbuf >> 17) & BITMASK(3);
+ bitbuf >>= 20;
+ bitsleft -= 20;
+ REFILL_BITS();
+ i = 1;
+ do {
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ } else {
+ bitbuf >>= 17;
+ bitsleft -= 17;
+ i = 0;
+ do {
+ if ((u8)bitsleft < 3)
+ REFILL_BITS();
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ }
+ for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
+
+ /* Build the decode table for the precode. */
+ SAFETY_CHECK(build_precode_decode_table(d));
+
+ /* Decode the litlen and offset codeword lengths. */
+ i = 0;
+ do {
+ unsigned presym;
+ u8 rep_val;
+ unsigned rep_count;
+
+ if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
+ REFILL_BITS();
+
+ /*
+ * The code below assumes that the precode decode table
+ * doesn't have any subtables.
+ */
+ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+ /* Decode the next precode symbol. */
+ entry = d->u.l.precode_decode_table[
+ bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ presym = entry >> 16;
+
+ if (presym < 16) {
+ /* Explicit codeword length */
+ d->u.l.lens[i++] = presym;
+ continue;
+ }
+
+ /* Run-length encoded codeword lengths */
+
+ /*
+ * Note: we don't need to immediately verify that the
+ * repeat count doesn't overflow the number of elements,
+ * since we've sized the lens array to have enough extra
+ * space to allow for the worst-case overrun (138 zeroes
+ * when only 1 length was remaining).
+ *
+ * In the case of the small repeat counts (presyms 16
+ * and 17), it is fastest to always write the maximum
+ * number of entries. That gets rid of branches that
+ * would otherwise be required.
+ *
+ * It is not just because of the numerical order that
+ * our checks go in the order 'presym < 16', 'presym ==
+ * 16', and 'presym == 17'. For typical data this is
+ * ordered from most frequent to least frequent case.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
+
+ if (presym == 16) {
+ /* Repeat the previous length 3 - 6 times. */
+ SAFETY_CHECK(i != 0);
+ rep_val = d->u.l.lens[i - 1];
+ STATIC_ASSERT(3 + BITMASK(2) == 6);
+ rep_count = 3 + (bitbuf & BITMASK(2));
+ bitbuf >>= 2;
+ bitsleft -= 2;
+ d->u.l.lens[i + 0] = rep_val;
+ d->u.l.lens[i + 1] = rep_val;
+ d->u.l.lens[i + 2] = rep_val;
+ d->u.l.lens[i + 3] = rep_val;
+ d->u.l.lens[i + 4] = rep_val;
+ d->u.l.lens[i + 5] = rep_val;
+ i += rep_count;
+ } else if (presym == 17) {
+ /* Repeat zero 3 - 10 times. */
+ STATIC_ASSERT(3 + BITMASK(3) == 10);
+ rep_count = 3 + (bitbuf & BITMASK(3));
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ d->u.l.lens[i + 0] = 0;
+ d->u.l.lens[i + 1] = 0;
+ d->u.l.lens[i + 2] = 0;
+ d->u.l.lens[i + 3] = 0;
+ d->u.l.lens[i + 4] = 0;
+ d->u.l.lens[i + 5] = 0;
+ d->u.l.lens[i + 6] = 0;
+ d->u.l.lens[i + 7] = 0;
+ d->u.l.lens[i + 8] = 0;
+ d->u.l.lens[i + 9] = 0;
+ i += rep_count;
+ } else {
+ /* Repeat zero 11 - 138 times. */
+ STATIC_ASSERT(11 + BITMASK(7) == 138);
+ rep_count = 11 + (bitbuf & BITMASK(7));
+ bitbuf >>= 7;
+ bitsleft -= 7;
+ memset(&d->u.l.lens[i], 0,
+ rep_count * sizeof(d->u.l.lens[i]));
+ i += rep_count;
+ }
+ } while (i < num_litlen_syms + num_offset_syms);
+
+ /* Unnecessary, but check this for consistency with zlib. */
+ SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
+
+ } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+ u16 len, nlen;
+
+ /*
+ * Uncompressed block: copy 'len' bytes literally from the input
+ * buffer to the output buffer.
+ */
+
+ bitsleft -= 3; /* for BTYPE and BFINAL */
+
+ /*
+ * Align the bitstream to the next byte boundary. This means
+ * the next byte boundary as if we were reading a byte at a
+ * time. Therefore, we have to rewind 'in_next' by any bytes
+ * that have been refilled but not actually consumed yet (not
+ * counting overread bytes, which don't increment 'in_next').
+ */
+ bitsleft = (u8)bitsleft;
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+ in_next -= (bitsleft >> 3) - overread_count;
+ overread_count = 0;
+ bitbuf = 0;
+ bitsleft = 0;
+
+ SAFETY_CHECK(in_end - in_next >= 4);
+ len = get_unaligned_le16(in_next);
+ nlen = get_unaligned_le16(in_next + 2);
+ in_next += 4;
+
+ SAFETY_CHECK(len == (u16)~nlen);
+ if (unlikely(len > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ SAFETY_CHECK(len <= in_end - in_next);
+
+ memcpy(out_next, in_next, len);
+ in_next += len;
+ out_next += len;
+
+ goto block_done;
+
+ } else {
+ unsigned i;
+
+ SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
+
+ /*
+ * Static Huffman block: build the decode tables for the static
+ * codes. Skip doing so if the tables are already set up from
+ * an earlier static block; this speeds up decompression of
+ * degenerate input of many empty or very short static blocks.
+ *
+ * Afterwards, the remainder is the same as decompressing a
+ * dynamic Huffman block.
+ */
+
+ bitbuf >>= 3; /* for BTYPE and BFINAL */
+ bitsleft -= 3;
+
+ if (d->static_codes_loaded)
+ goto have_decode_tables;
+
+ d->static_codes_loaded = true;
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
+
+ for (i = 0; i < 144; i++)
+ d->u.l.lens[i] = 8;
+ for (; i < 256; i++)
+ d->u.l.lens[i] = 9;
+ for (; i < 280; i++)
+ d->u.l.lens[i] = 7;
+ for (; i < 288; i++)
+ d->u.l.lens[i] = 8;
+
+ for (; i < 288 + 32; i++)
+ d->u.l.lens[i] = 5;
+
+ num_litlen_syms = 288;
+ num_offset_syms = 32;
+ }
+
+ /* Decompressing a Huffman block (either dynamic or static) */
+
+ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+ SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
+have_decode_tables:
+ litlen_tablemask = BITMASK(d->litlen_tablebits);
+
+ /*
+ * This is the "fastloop" for decoding literals and matches. It does
+ * bounds checks on in_next and out_next in the loop conditions so that
+ * additional bounds checks aren't needed inside the loop body.
+ *
+ * To reduce latency, the bitbuffer is refilled and the next litlen
+ * decode table entry is preloaded before each loop iteration.
+ */
+ if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
+ goto generic_loop;
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ do {
+ u32 length, offset, lit;
+ const u8 *src;
+ u8 *dst;
+
+ /*
+ * Consume the bits for the litlen decode table entry. Save the
+ * original bitbuf for later, in case the extra match length
+ * bits need to be extracted from it.
+ */
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+
+ /*
+ * Begin by checking for a "fast" literal, i.e. a literal that
+ * doesn't need a subtable.
+ */
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * On 64-bit platforms, we decode up to 2 extra fast
+ * literals in addition to the primary item, as this
+ * increases performance and still leaves enough bits
+ * remaining for what follows. We could actually do 3,
+ * assuming LITLEN_TABLEBITS=11, but that actually
+ * decreases performance slightly (perhaps by messing
+ * with the branch prediction of the conditional refill
+ * that happens later while decoding the match offset).
+ *
+ * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
+ * and FASTLOOP_MAX_BYTES_READ need to be updated if the
+ * number of extra literals decoded here is changed.
+ */
+ if (/* enough bits for 2 fast literals + length + offset preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ LENGTH_MAXBITS,
+ OFFSET_TABLEBITS) &&
+ /* enough bits for 2 fast literals + slow literal + litlen preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS)) {
+ /* 1st extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /* 2nd extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * Another fast literal, but
+ * this one is in lieu of the
+ * primary item, so it doesn't
+ * count as one of the extras.
+ */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+ } else {
+ /*
+ * Decode a literal. While doing so, preload
+ * the next litlen decode table entry and refill
+ * the bitbuffer. To reduce latency, we've
+ * arranged for there to be enough "preloadable"
+ * bits remaining to do the table preload
+ * independently of the refill.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
+ LITLEN_TABLEBITS, LITLEN_TABLEBITS));
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+
+ /*
+ * It's not a literal entry, so it can be a length entry, a
+ * subtable pointer entry, or an end-of-block entry. Detect the
+ * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Subtable pointer or end-of-block entry */
+
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+
+ /*
+ * A subtable is required. Load and consume the
+ * subtable entry. The subtable entry can be of any
+ * type: literal, length, or end-of-block.
+ */
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ /*
+ * 32-bit platforms that use the byte-at-a-time refill
+ * method have to do a refill here for there to always
+ * be enough bits to decode a literal that requires a
+ * subtable, then preload the next litlen decode table
+ * entry; or to decode a match length that requires a
+ * subtable, then preload the offset decode table entry.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS) ||
+ !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
+ OFFSET_TABLEBITS))
+ REFILL_BITS_IN_FASTLOOP();
+ if (entry & HUFFDEC_LITERAL) {
+ /* Decode a literal that required a subtable. */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ /* Else, it's a length that required a subtable. */
+ }
+
+ /*
+ * Decode the match length: the length base value associated
+ * with the litlen symbol (which we extract from the decode
+ * table entry), plus the extra length bits. We don't need to
+ * consume the extra length bits here, as they were included in
+ * the bits consumed by the entry earlier. We also don't need
+ * to check for too-long matches here, as this is inside the
+ * fastloop where it's already been verified that the output
+ * buffer has enough space remaining to copy a max-length match.
+ */
+ length = entry >> 16;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /*
+ * Decode the match offset. There are enough "preloadable" bits
+ * remaining to preload the offset decode table entry, but a
+ * refill might be needed before consuming it.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
+ OFFSET_TABLEBITS));
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
+ LITLEN_TABLEBITS)) {
+ /*
+ * Decoding a match offset on a 64-bit platform. We may
+ * need to refill once, but then we can decode the whole
+ * offset and preload the next litlen table entry.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ } else {
+ /* Decoding a match offset on a 32-bit platform */
+ REFILL_BITS_IN_FASTLOOP();
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ REFILL_BITS_IN_FASTLOOP();
+ /* No further refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(
+ OFFSET_MAXBITS - OFFSET_TABLEBITS));
+ } else {
+ /* No refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
+ }
+ }
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /* Validate the match offset; needed even in the fastloop. */
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ /*
+ * Before starting to issue the instructions to copy the match,
+ * refill the bitbuffer and preload the litlen decode table
+ * entry for the next loop iteration. This can increase
+ * performance by allowing the latency of the match copy to
+ * overlap with these other operations. To further reduce
+ * latency, we've arranged for there to be enough bits remaining
+ * to do the table preload independently of the refill, except
+ * on 32-bit platforms using the byte-at-a-time refill method.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(
+ MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
+ OFFSET_MAXFASTBITS),
+ LITLEN_TABLEBITS) &&
+ unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+
+ /*
+ * Copy the match. On most CPUs the fastest method is a
+ * word-at-a-time copy, unconditionally copying about 5 words
+ * since this is enough for most matches without being too much.
+ *
+ * The normal word-at-a-time copy works for offset >= WORDBYTES,
+ * which is most cases. The case of offset == 1 is also common
+ * and is worth optimizing for, since it is just RLE encoding of
+ * the previous byte, which is the result of compressing long
+ * runs of the same byte.
+ *
+ * Writing past the match 'length' is allowed here, since it's
+ * been ensured there is enough output space left for a slight
+ * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
+ * the maximum possible overrun here is changed.
+ */
+ if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
+ machine_word_t v;
+
+ /*
+ * This part tends to get auto-vectorized, so keep it
+ * copying a multiple of 16 bytes at a time.
+ */
+ v = (machine_word_t)0x0101010101010101 * src[0];
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ do {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ } while (dst < out_next);
+ } else {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+ } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
+
+ /*
+ * This is the generic loop for decoding literals and matches. This
+ * handles cases where in_next and out_next are close to the end of
+ * their respective buffers. Usually this loop isn't performance-
+ * critical, as most time is spent in the fastloop above instead. We
+ * therefore omit some optimizations here in favor of smaller code.
+ */
+generic_loop:
+ for (;;) {
+ u32 length, offset;
+ const u8 *src;
+ u8 *dst;
+
+ REFILL_BITS();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ }
+ length = entry >> 16;
+ if (entry & HUFFDEC_LITERAL) {
+ if (unlikely(out_next == out_end))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ *out_next++ = length;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+ if (unlikely(length > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+
+ if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
+ REFILL_BITS();
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ if (!CAN_CONSUME(OFFSET_MAXBITS))
+ REFILL_BITS();
+ }
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+
+block_done:
+ /* Finished decoding a block */
+
+ if (!is_final_block)
+ goto next_block;
+
+ /* That was the last block. */
+
+ bitsleft = (u8)bitsleft;
+
+ /*
+ * If any of the implicit appended zero bytes were consumed (not just
+ * refilled) before hitting end of stream, then the data is bad.
+ */
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+
+ /* Optionally return the actual number of bytes consumed. */
+ if (actual_in_nbytes_ret) {
+ /* Don't count bytes that were refilled but not consumed. */
+ in_next -= (bitsleft >> 3) - overread_count;
+
+ *actual_in_nbytes_ret = in_next - (u8 *)in;
+ }
+
+ /* Optionally return the actual number of bytes written. */
+ if (actual_out_nbytes_ret) {
+ *actual_out_nbytes_ret = out_next - (u8 *)out;
+ } else {
+ if (out_next != out_end)
+ return LIBDEFLATE_SHORT_OUTPUT;
+ }
+ return LIBDEFLATE_SUCCESS;
+}
+
+#undef FUNCNAME
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
+
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_decompress_func
+#if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+#ifndef LIB_X86_DECOMPRESS_IMPL_H
+#define LIB_X86_DECOMPRESS_IMPL_H
+
+/*
+ * BMI2 optimized version
+ *
+ * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation
+ * enabled yet. That would require that this be moved to its own .c file.
+ */
+#if HAVE_BMI2_INTRIN
+# define deflate_decompress_bmi2 deflate_decompress_bmi2
+# define FUNCNAME deflate_decompress_bmi2
+# if !HAVE_BMI2_NATIVE
+# define ATTRIBUTES _target_attribute("bmi2")
+# endif
+ /*
+ * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+ * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+ * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+ * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+ * Nevertheless, their implementation using the bzhi intrinsic is identical,
+ * as the bzhi instruction truncates the count to 8 bits implicitly.
+ */
+# ifndef __clang__
+#    include <immintrin.h>
+# ifdef ARCH_X86_64
+# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+# else
+# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+# endif
+# endif
+/*
+ * decompress_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This is the actual DEFLATE decompression routine, lifted out of
+ * deflate_decompress.c so that it can be compiled multiple times with different
+ * target instruction sets.
+ */
+
+#ifndef ATTRIBUTES
+# define ATTRIBUTES
+#endif
+#ifndef EXTRACT_VARBITS
+# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count))
+#endif
+#ifndef EXTRACT_VARBITS8
+# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
+#endif
+
+static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+FUNCNAME(struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+ u8 *out_next = (u8*)out;
+ u8 * const out_end = out_next + out_nbytes_avail;
+ u8 * const out_fastloop_end =
+ out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
+
+ /* Input bitstream state; see deflate_decompress.c for documentation */
+ const u8 *in_next = (u8*)in;
+ const u8 * const in_end = in_next + in_nbytes;
+ const u8 * const in_fastloop_end =
+ in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
+ bitbuf_t bitbuf = 0;
+ bitbuf_t saved_bitbuf;
+ u32 bitsleft = 0;
+ size_t overread_count = 0;
+
+ bool is_final_block;
+ unsigned block_type;
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ bitbuf_t litlen_tablemask;
+ u32 entry;
+
+next_block:
+ /* Starting to read the next block */
+ ;
+
+ STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
+ REFILL_BITS();
+
+ /* BFINAL: 1 bit */
+ is_final_block = bitbuf & BITMASK(1);
+
+ /* BTYPE: 2 bits */
+ block_type = (bitbuf >> 1) & BITMASK(2);
+
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
+
+ /* Dynamic Huffman block */
+
+ /* The order in which precode lengths are stored */
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+ };
+
+ unsigned num_explicit_precode_lens;
+ unsigned i;
+
+ /* Read the codeword length counts. */
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
+ num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
+ num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
+ num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
+
+ d->static_codes_loaded = false;
+
+ /*
+ * Read the precode codeword lengths.
+ *
+ * A 64-bit bitbuffer is just one bit too small to hold the
+ * maximum number of precode lens, so to minimize branches we
+ * merge one len with the previous fields.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
+ if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+ d->u.precode_lens[deflate_precode_lens_permutation[0]] =
+ (bitbuf >> 17) & BITMASK(3);
+ bitbuf >>= 20;
+ bitsleft -= 20;
+ REFILL_BITS();
+ i = 1;
+ do {
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ } else {
+ bitbuf >>= 17;
+ bitsleft -= 17;
+ i = 0;
+ do {
+ if ((u8)bitsleft < 3)
+ REFILL_BITS();
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ }
+ for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
+
+ /* Build the decode table for the precode. */
+ SAFETY_CHECK(build_precode_decode_table(d));
+
+ /* Decode the litlen and offset codeword lengths. */
+ i = 0;
+ do {
+ unsigned presym;
+ u8 rep_val;
+ unsigned rep_count;
+
+ if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
+ REFILL_BITS();
+
+ /*
+ * The code below assumes that the precode decode table
+ * doesn't have any subtables.
+ */
+ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+ /* Decode the next precode symbol. */
+ entry = d->u.l.precode_decode_table[
+ bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ presym = entry >> 16;
+
+ if (presym < 16) {
+ /* Explicit codeword length */
+ d->u.l.lens[i++] = presym;
+ continue;
+ }
+
+ /* Run-length encoded codeword lengths */
+
+ /*
+ * Note: we don't need to immediately verify that the
+ * repeat count doesn't overflow the number of elements,
+ * since we've sized the lens array to have enough extra
+ * space to allow for the worst-case overrun (138 zeroes
+ * when only 1 length was remaining).
+ *
+ * In the case of the small repeat counts (presyms 16
+ * and 17), it is fastest to always write the maximum
+ * number of entries. That gets rid of branches that
+ * would otherwise be required.
+ *
+ * It is not just because of the numerical order that
+ * our checks go in the order 'presym < 16', 'presym ==
+ * 16', and 'presym == 17'. For typical data this is
+ * ordered from most frequent to least frequent case.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
+
+ if (presym == 16) {
+ /* Repeat the previous length 3 - 6 times. */
+ SAFETY_CHECK(i != 0);
+ rep_val = d->u.l.lens[i - 1];
+ STATIC_ASSERT(3 + BITMASK(2) == 6);
+ rep_count = 3 + (bitbuf & BITMASK(2));
+ bitbuf >>= 2;
+ bitsleft -= 2;
+ d->u.l.lens[i + 0] = rep_val;
+ d->u.l.lens[i + 1] = rep_val;
+ d->u.l.lens[i + 2] = rep_val;
+ d->u.l.lens[i + 3] = rep_val;
+ d->u.l.lens[i + 4] = rep_val;
+ d->u.l.lens[i + 5] = rep_val;
+ i += rep_count;
+ } else if (presym == 17) {
+ /* Repeat zero 3 - 10 times. */
+ STATIC_ASSERT(3 + BITMASK(3) == 10);
+ rep_count = 3 + (bitbuf & BITMASK(3));
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ d->u.l.lens[i + 0] = 0;
+ d->u.l.lens[i + 1] = 0;
+ d->u.l.lens[i + 2] = 0;
+ d->u.l.lens[i + 3] = 0;
+ d->u.l.lens[i + 4] = 0;
+ d->u.l.lens[i + 5] = 0;
+ d->u.l.lens[i + 6] = 0;
+ d->u.l.lens[i + 7] = 0;
+ d->u.l.lens[i + 8] = 0;
+ d->u.l.lens[i + 9] = 0;
+ i += rep_count;
+ } else {
+ /* Repeat zero 11 - 138 times. */
+ STATIC_ASSERT(11 + BITMASK(7) == 138);
+ rep_count = 11 + (bitbuf & BITMASK(7));
+ bitbuf >>= 7;
+ bitsleft -= 7;
+ memset(&d->u.l.lens[i], 0,
+ rep_count * sizeof(d->u.l.lens[i]));
+ i += rep_count;
+ }
+ } while (i < num_litlen_syms + num_offset_syms);
+
+ /* Unnecessary, but check this for consistency with zlib. */
+ SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
+
+ } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+ u16 len, nlen;
+
+ /*
+ * Uncompressed block: copy 'len' bytes literally from the input
+ * buffer to the output buffer.
+ */
+
+ bitsleft -= 3; /* for BTYPE and BFINAL */
+
+ /*
+ * Align the bitstream to the next byte boundary. This means
+ * the next byte boundary as if we were reading a byte at a
+ * time. Therefore, we have to rewind 'in_next' by any bytes
+ * that have been refilled but not actually consumed yet (not
+ * counting overread bytes, which don't increment 'in_next').
+ */
+ bitsleft = (u8)bitsleft;
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+ in_next -= (bitsleft >> 3) - overread_count;
+ overread_count = 0;
+ bitbuf = 0;
+ bitsleft = 0;
+
+ SAFETY_CHECK(in_end - in_next >= 4);
+ len = get_unaligned_le16(in_next);
+ nlen = get_unaligned_le16(in_next + 2);
+ in_next += 4;
+
+ SAFETY_CHECK(len == (u16)~nlen);
+ if (unlikely(len > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ SAFETY_CHECK(len <= in_end - in_next);
+
+ memcpy(out_next, in_next, len);
+ in_next += len;
+ out_next += len;
+
+ goto block_done;
+
+ } else {
+ unsigned i;
+
+ SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
+
+ /*
+ * Static Huffman block: build the decode tables for the static
+ * codes. Skip doing so if the tables are already set up from
+ * an earlier static block; this speeds up decompression of
+ * degenerate input of many empty or very short static blocks.
+ *
+ * Afterwards, the remainder is the same as decompressing a
+ * dynamic Huffman block.
+ */
+
+ bitbuf >>= 3; /* for BTYPE and BFINAL */
+ bitsleft -= 3;
+
+ if (d->static_codes_loaded)
+ goto have_decode_tables;
+
+ d->static_codes_loaded = true;
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
+
+ for (i = 0; i < 144; i++)
+ d->u.l.lens[i] = 8;
+ for (; i < 256; i++)
+ d->u.l.lens[i] = 9;
+ for (; i < 280; i++)
+ d->u.l.lens[i] = 7;
+ for (; i < 288; i++)
+ d->u.l.lens[i] = 8;
+
+ for (; i < 288 + 32; i++)
+ d->u.l.lens[i] = 5;
+
+ num_litlen_syms = 288;
+ num_offset_syms = 32;
+ }
+
+ /* Decompressing a Huffman block (either dynamic or static) */
+
+ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+ SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
+have_decode_tables:
+ litlen_tablemask = BITMASK(d->litlen_tablebits);
+
+ /*
+ * This is the "fastloop" for decoding literals and matches. It does
+ * bounds checks on in_next and out_next in the loop conditions so that
+ * additional bounds checks aren't needed inside the loop body.
+ *
+ * To reduce latency, the bitbuffer is refilled and the next litlen
+ * decode table entry is preloaded before each loop iteration.
+ */
+ if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
+ goto generic_loop;
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ do {
+ u32 length, offset, lit;
+ const u8 *src;
+ u8 *dst;
+
+ /*
+ * Consume the bits for the litlen decode table entry. Save the
+ * original bitbuf for later, in case the extra match length
+ * bits need to be extracted from it.
+ */
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+
+ /*
+ * Begin by checking for a "fast" literal, i.e. a literal that
+ * doesn't need a subtable.
+ */
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * On 64-bit platforms, we decode up to 2 extra fast
+ * literals in addition to the primary item, as this
+ * increases performance and still leaves enough bits
+ * remaining for what follows. We could actually do 3,
+ * assuming LITLEN_TABLEBITS=11, but that actually
+ * decreases performance slightly (perhaps by messing
+ * with the branch prediction of the conditional refill
+ * that happens later while decoding the match offset).
+ *
+ * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
+ * and FASTLOOP_MAX_BYTES_READ need to be updated if the
+ * number of extra literals decoded here is changed.
+ */
+ if (/* enough bits for 2 fast literals + length + offset preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ LENGTH_MAXBITS,
+ OFFSET_TABLEBITS) &&
+ /* enough bits for 2 fast literals + slow literal + litlen preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS)) {
+ /* 1st extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /* 2nd extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * Another fast literal, but
+ * this one is in lieu of the
+ * primary item, so it doesn't
+ * count as one of the extras.
+ */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+ } else {
+ /*
+ * Decode a literal. While doing so, preload
+ * the next litlen decode table entry and refill
+ * the bitbuffer. To reduce latency, we've
+ * arranged for there to be enough "preloadable"
+ * bits remaining to do the table preload
+ * independently of the refill.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
+ LITLEN_TABLEBITS, LITLEN_TABLEBITS));
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+
+ /*
+ * It's not a literal entry, so it can be a length entry, a
+ * subtable pointer entry, or an end-of-block entry. Detect the
+ * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Subtable pointer or end-of-block entry */
+
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+
+ /*
+ * A subtable is required. Load and consume the
+ * subtable entry. The subtable entry can be of any
+ * type: literal, length, or end-of-block.
+ */
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ /*
+ * 32-bit platforms that use the byte-at-a-time refill
+ * method have to do a refill here for there to always
+ * be enough bits to decode a literal that requires a
+ * subtable, then preload the next litlen decode table
+ * entry; or to decode a match length that requires a
+ * subtable, then preload the offset decode table entry.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS) ||
+ !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
+ OFFSET_TABLEBITS))
+ REFILL_BITS_IN_FASTLOOP();
+ if (entry & HUFFDEC_LITERAL) {
+ /* Decode a literal that required a subtable. */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ /* Else, it's a length that required a subtable. */
+ }
+
+ /*
+ * Decode the match length: the length base value associated
+ * with the litlen symbol (which we extract from the decode
+ * table entry), plus the extra length bits. We don't need to
+ * consume the extra length bits here, as they were included in
+ * the bits consumed by the entry earlier. We also don't need
+ * to check for too-long matches here, as this is inside the
+ * fastloop where it's already been verified that the output
+ * buffer has enough space remaining to copy a max-length match.
+ */
+ length = entry >> 16;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /*
+ * Decode the match offset. There are enough "preloadable" bits
+ * remaining to preload the offset decode table entry, but a
+ * refill might be needed before consuming it.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
+ OFFSET_TABLEBITS));
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
+ LITLEN_TABLEBITS)) {
+ /*
+ * Decoding a match offset on a 64-bit platform. We may
+ * need to refill once, but then we can decode the whole
+ * offset and preload the next litlen table entry.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ } else {
+ /* Decoding a match offset on a 32-bit platform */
+ REFILL_BITS_IN_FASTLOOP();
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ REFILL_BITS_IN_FASTLOOP();
+ /* No further refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(
+ OFFSET_MAXBITS - OFFSET_TABLEBITS));
+ } else {
+ /* No refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
+ }
+ }
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /* Validate the match offset; needed even in the fastloop. */
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ /*
+ * Before starting to issue the instructions to copy the match,
+ * refill the bitbuffer and preload the litlen decode table
+ * entry for the next loop iteration. This can increase
+ * performance by allowing the latency of the match copy to
+ * overlap with these other operations. To further reduce
+ * latency, we've arranged for there to be enough bits remaining
+ * to do the table preload independently of the refill, except
+ * on 32-bit platforms using the byte-at-a-time refill method.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(
+ MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
+ OFFSET_MAXFASTBITS),
+ LITLEN_TABLEBITS) &&
+ unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+
+ /*
+ * Copy the match. On most CPUs the fastest method is a
+ * word-at-a-time copy, unconditionally copying about 5 words
+ * since this is enough for most matches without being too much.
+ *
+ * The normal word-at-a-time copy works for offset >= WORDBYTES,
+ * which is most cases. The case of offset == 1 is also common
+ * and is worth optimizing for, since it is just RLE encoding of
+ * the previous byte, which is the result of compressing long
+ * runs of the same byte.
+ *
+ * Writing past the match 'length' is allowed here, since it's
+ * been ensured there is enough output space left for a slight
+ * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
+ * the maximum possible overrun here is changed.
+ */
+ if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
+ machine_word_t v;
+
+ /*
+ * This part tends to get auto-vectorized, so keep it
+ * copying a multiple of 16 bytes at a time.
+ */
+ v = (machine_word_t)0x0101010101010101 * src[0];
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ do {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ } while (dst < out_next);
+ } else {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+ } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
+
+ /*
+ * This is the generic loop for decoding literals and matches. This
+ * handles cases where in_next and out_next are close to the end of
+ * their respective buffers. Usually this loop isn't performance-
+ * critical, as most time is spent in the fastloop above instead. We
+ * therefore omit some optimizations here in favor of smaller code.
+ */
+generic_loop:
+ for (;;) {
+ u32 length, offset;
+ const u8 *src;
+ u8 *dst;
+
+ REFILL_BITS();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ }
+ length = entry >> 16;
+ if (entry & HUFFDEC_LITERAL) {
+ if (unlikely(out_next == out_end))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ *out_next++ = length;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+ if (unlikely(length > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+
+ if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
+ REFILL_BITS();
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ if (!CAN_CONSUME(OFFSET_MAXBITS))
+ REFILL_BITS();
+ }
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+
+block_done:
+ /* Finished decoding a block */
+
+ if (!is_final_block)
+ goto next_block;
+
+ /* That was the last block. */
+
+ bitsleft = (u8)bitsleft;
+
+ /*
+ * If any of the implicit appended zero bytes were consumed (not just
+ * refilled) before hitting end of stream, then the data is bad.
+ */
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+
+ /* Optionally return the actual number of bytes consumed. */
+ if (actual_in_nbytes_ret) {
+ /* Don't count bytes that were refilled but not consumed. */
+ in_next -= (bitsleft >> 3) - overread_count;
+
+ *actual_in_nbytes_ret = in_next - (u8 *)in;
+ }
+
+ /* Optionally return the actual number of bytes written. */
+ if (actual_out_nbytes_ret) {
+ *actual_out_nbytes_ret = out_next - (u8 *)out;
+ } else {
+ if (out_next != out_end)
+ return LIBDEFLATE_SHORT_OUTPUT;
+ }
+ return LIBDEFLATE_SUCCESS;
+}
+
+#undef FUNCNAME
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
+
+#endif /* HAVE_BMI2_INTRIN */
+
+#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+#define DEFAULT_IMPL deflate_decompress_bmi2
+#else
+static inline decompress_func_t
+arch_select_decompress_func(void)
+{
+#ifdef deflate_decompress_bmi2
+ if (HAVE_BMI2(get_x86_cpu_features()))
+ return deflate_decompress_bmi2;
+#endif
+ return NULL;
+}
+#define arch_select_decompress_func arch_select_decompress_func
+#endif
+
+#endif /* LIB_X86_DECOMPRESS_IMPL_H */
+
+#endif
+
+#ifndef DEFAULT_IMPL
+# define DEFAULT_IMPL deflate_decompress_default
+#endif
+
+#ifdef arch_select_decompress_func
+static enum libdeflate_result
+dispatch_decomp(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+static volatile decompress_func_t decompress_impl = dispatch_decomp;
+
+/* Choose the best implementation at runtime. */
+static enum libdeflate_result
+dispatch_decomp(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+ decompress_func_t f = arch_select_decompress_func();
+
+ if (f == NULL)
+ f = DEFAULT_IMPL;
+
+ decompress_impl = f;
+ return f(d, in, in_nbytes, out, out_nbytes_avail,
+ actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+# define decompress_impl DEFAULT_IMPL
+#endif
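+
+/*
+ * Illustrative trace of the self-dispatching function pointer above (names as
+ * in the code; behavior inferred from it, shown here only as a sketch):
+ *
+ *	first call:   decompress_impl == dispatch_decomp, so dispatch_decomp()
+ *	              asks arch_select_decompress_func() for the best routine
+ *	              (falling back to DEFAULT_IMPL), caches it back into
+ *	              decompress_impl, and forwards the call to it;
+ *	later calls:  decompress_impl already points at the chosen routine,
+ *	              so CPU feature detection is not repeated.
+ */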
+
+/*
+ * This is the main DEFLATE decompression routine. See libdeflate.h for the
+ * documentation.
+ *
+ * Note that the real code is in decompress_template.h. The part here just
+ * handles calling the appropriate implementation depending on the CPU features
+ * at runtime.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret)
+{
+ return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
+ actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret)
+{
+ return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
+ out, out_nbytes_avail,
+ NULL, actual_out_nbytes_ret);
+}
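+
+/*
+ * Illustrative usage sketch (not part of the library itself). A caller that
+ * knows the uncompressed size up front would typically drive the API like
+ * this; 'comp_buf', 'comp_size', 'out_buf' and 'out_capacity' are
+ * hypothetical names:
+ *
+ *	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
+ *	size_t actual_out = 0;
+ *	if (d != NULL &&
+ *	    libdeflate_deflate_decompress(d, comp_buf, comp_size,
+ *	                                  out_buf, out_capacity,
+ *	                                  &actual_out) == LIBDEFLATE_SUCCESS) {
+ *		// 'actual_out' bytes of decompressed data are now in 'out_buf'
+ *	}
+ *	libdeflate_free_decompressor(d);
+ */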
+
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options)
+{
+ struct libdeflate_decompressor *d;
+
+ /*
+ * Note: if more fields are added to libdeflate_options, this code will
+ * need to be updated to support both the old and new structs.
+ */
+ if (options->sizeof_options != sizeof(*options))
+ return NULL;
+
+ d = (libdeflate_decompressor*)(options->malloc_func ? options->malloc_func :
+ libdeflate_default_malloc_func)(sizeof(*d));
+ if (d == NULL)
+ return NULL;
+ /*
+ * Note that only certain parts of the decompressor actually must be
+ * initialized here:
+ *
+ * - 'static_codes_loaded' must be initialized to false.
+ *
+ * - The first half of the main portion of each decode table must be
+ * initialized to any value, to avoid reading from uninitialized
+ * memory during table expansion in build_decode_table(). (Although,
+ * this is really just to avoid warnings with dynamic tools like
+ * valgrind, since build_decode_table() is guaranteed to initialize
+ * all entries eventually anyway.)
+ *
+ * - 'free_func' must be set.
+ *
+ * But for simplicity, we currently just zero the whole decompressor.
+ */
+ memset(d, 0, sizeof(*d));
+ d->free_func = options->free_func ?
+ options->free_func : libdeflate_default_free_func;
+ return d;
+}
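+
+/*
+ * Illustrative sketch: to route the decompressor's allocation through a
+ * custom allocator, a caller can fill in the options struct checked above.
+ * 'my_malloc' and 'my_free' are hypothetical caller-provided functions:
+ *
+ *	struct libdeflate_options opts = { 0 };
+ *	opts.sizeof_options = sizeof(opts);
+ *	opts.malloc_func = my_malloc;
+ *	opts.free_func = my_free;
+ *	struct libdeflate_decompressor *d =
+ *		libdeflate_alloc_decompressor_ex(&opts);
+ */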
+
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor(void)
+{
+ static const struct libdeflate_options defaults = {
+ /*.sizeof_options = */sizeof(defaults),
+ };
+ return libdeflate_alloc_decompressor_ex(&defaults);
+}
+
+LIBDEFLATEAPI void
+libdeflate_free_decompressor(struct libdeflate_decompressor *d)
+{
+ if (d)
+ d->free_func(d);
+}
+
+
+/*
+ * utils.c - utility functions for libdeflate
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef FREESTANDING
+# define malloc NULL
+# define free NULL
+#else
+# include <stdlib.h>
+#endif
+
+malloc_func_t libdeflate_default_malloc_func = malloc;
+free_func_t libdeflate_default_free_func = free;
+
+void *
+libdeflate_aligned_malloc(malloc_func_t malloc_func,
+ size_t alignment, size_t size)
+{
+ void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size);
+
+ if (ptr) {
+ void *orig_ptr = ptr;
+
+ ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+ ((void **)ptr)[-1] = orig_ptr;
+ }
+ return ptr;
+}
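+
+/*
+ * Worked example of the alignment scheme above, assuming 8-byte pointers and
+ * alignment == 64: if malloc_func returns 0x1008, the pointer is advanced
+ * past the stashed word and rounded up to 0x1040 (the next 64-byte boundary
+ * at or after 0x1010); the original pointer 0x1008 is stored in the word just
+ * below the returned address (at 0x1038), which is what
+ * libdeflate_aligned_free() below reads back and passes to free_func.
+ */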
+
+void
+libdeflate_aligned_free(free_func_t free_func, void *ptr)
+{
+ (*free_func)(((void **)ptr)[-1]);
+}
+
+LIBDEFLATEAPI void
+libdeflate_set_memory_allocator(malloc_func_t malloc_func,
+ free_func_t free_func)
+{
+ libdeflate_default_malloc_func = malloc_func;
+ libdeflate_default_free_func = free_func;
+}
+
+/*
+ * Implementations of libc functions for freestanding library builds.
+ * Normal library builds don't use these. Not optimized yet; usually the
+ * compiler expands these functions and doesn't actually call them anyway.
+ */
+#ifdef FREESTANDING
+#undef memset
+void * __attribute__((weak))
+memset(void *s, int c, size_t n)
+{
+ u8 *p = s;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ p[i] = c;
+ return s;
+}
+
+#undef memcpy
+void * __attribute__((weak))
+memcpy(void *dest, const void *src, size_t n)
+{
+ u8 *d = dest;
+ const u8 *s = src;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
+ return dest;
+}
+
+#undef memmove
+void * __attribute__((weak))
+memmove(void *dest, const void *src, size_t n)
+{
+ u8 *d = dest;
+ const u8 *s = src;
+ size_t i;
+
+ if (d <= s)
+ return memcpy(d, s, n);
+
+ for (i = n; i > 0; i--)
+ d[i - 1] = s[i - 1];
+ return dest;
+}
+
+#undef memcmp
+int __attribute__((weak))
+memcmp(const void *s1, const void *s2, size_t n)
+{
+ const u8 *p1 = s1;
+ const u8 *p2 = s2;
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (p1[i] != p2[i])
+ return (int)p1[i] - (int)p2[i];
+ }
+ return 0;
+}
+#endif /* FREESTANDING */
+
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+#include <stdio.h>
+#include <stdlib.h>
+void
+libdeflate_assertion_failed(const char *expr, const char *file, int line)
+{
+ fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line);
+ abort();
+}
+#endif /* LIBDEFLATE_ENABLE_ASSERTIONS */
+
+/*
+ * x86/cpu_features.c - feature detection for x86 CPUs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if HAVE_DYNAMIC_X86_CPU_FEATURES
+
+/*
+ * With old GCC versions we have to manually save and restore the x86_32 PIC
+ * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602
+ */
+#if defined(ARCH_X86_32) && defined(__PIC__)
+# define EBX_CONSTRAINT "=&r"
+#else
+# define EBX_CONSTRAINT "=b"
+#endif
+
+/* Execute the CPUID instruction. */
+static inline void
+cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+#ifdef _MSC_VER
+ int result[4];
+
+ __cpuidex(result, leaf, subleaf);
+ *a = result[0];
+ *b = result[1];
+ *c = result[2];
+ *d = result[3];
+#else
+ __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
+ "cpuid \n"
+ ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
+ : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+ : "a" (leaf), "c" (subleaf));
+#endif
+}
+
+/* Read an extended control register. */
+static inline u64
+read_xcr(u32 index)
+{
+#ifdef _MSC_VER
+ return _xgetbv(index);
+#else
+ u32 d, a;
+
+ /*
+ * Execute the "xgetbv" instruction. Old versions of binutils do not
+ * recognize this instruction, so list the raw bytes instead.
+ *
+ * This must be 'volatile' to prevent this code from being moved out
+ * from under the check for OSXSAVE.
+ */
+ __asm__ volatile(".byte 0x0f, 0x01, 0xd0" :
+ "=d" (d), "=a" (a) : "c" (index));
+
+ return ((u64)d << 32) | a;
+#endif
+}
+
+static const struct cpu_feature x86_cpu_feature_table[] = {
+ {X86_CPU_FEATURE_SSE2, "sse2"},
+ {X86_CPU_FEATURE_PCLMUL, "pclmul"},
+ {X86_CPU_FEATURE_AVX, "avx"},
+ {X86_CPU_FEATURE_AVX2, "avx2"},
+ {X86_CPU_FEATURE_BMI2, "bmi2"},
+};
+
+volatile u32 libdeflate_x86_cpu_features = 0;
+
+/* Initialize libdeflate_x86_cpu_features. */
+void libdeflate_init_x86_cpu_features(void)
+{
+ u32 max_leaf, a, b, c, d;
+ u64 xcr0 = 0;
+ u32 features = 0;
+
+ /* EAX=0: Highest Function Parameter and Manufacturer ID */
+ cpuid(0, 0, &max_leaf, &b, &c, &d);
+ if (max_leaf < 1)
+ goto out;
+
+ /* EAX=1: Processor Info and Feature Bits */
+ cpuid(1, 0, &a, &b, &c, &d);
+ if (d & (1 << 26))
+ features |= X86_CPU_FEATURE_SSE2;
+ if (c & (1 << 1))
+ features |= X86_CPU_FEATURE_PCLMUL;
+ if (c & (1 << 27))
+ xcr0 = read_xcr(0);
+ if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6))
+ features |= X86_CPU_FEATURE_AVX;
+
+ if (max_leaf < 7)
+ goto out;
+
+ /* EAX=7, ECX=0: Extended Features */
+ cpuid(7, 0, &a, &b, &c, &d);
+ if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
+ features |= X86_CPU_FEATURE_AVX2;
+ if (b & (1 << 8))
+ features |= X86_CPU_FEATURE_BMI2;
+
+out:
+ disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
+ ARRAY_LEN(x86_cpu_feature_table));
+
+ libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
+}
+
+#endif /* HAVE_DYNAMIC_X86_CPU_FEATURES */
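The init routine above publishes a bitmask in libdeflate_x86_cpu_features that the library's dispatch code reads. A rough sketch of that pattern, assuming the X86_CPU_FEATURE_* constants and HAVE_DYNAMIC_X86_CPU_FEATURES come from the library's internal x86/cpu_features.h; the typedef and the two implementation functions are hypothetical:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc32_impl_t)(uint32_t crc, const void *p, size_t len);

/* Hypothetical implementations selected at runtime. */
extern uint32_t crc32_pclmul(uint32_t crc, const void *p, size_t len);
extern uint32_t crc32_generic(uint32_t crc, const void *p, size_t len);

static crc32_impl_t
choose_crc32_impl(void)
{
#if HAVE_DYNAMIC_X86_CPU_FEATURES
    /* A zero mask means detection has not run yet (the KNOWN bit is set
     * once libdeflate_init_x86_cpu_features() completes). */
    if (libdeflate_x86_cpu_features == 0)
        libdeflate_init_x86_cpu_features();
    if (libdeflate_x86_cpu_features & X86_CPU_FEATURE_PCLMUL)
        return crc32_pclmul;
#endif
    return crc32_generic;
}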
diff --git a/Source/ThirdParty/OpenFBX/libdeflate.h b/Source/ThirdParty/OpenFBX/libdeflate.h
new file mode 100644
index 000000000..382d895de
--- /dev/null
+++ b/Source/ThirdParty/OpenFBX/libdeflate.h
@@ -0,0 +1,411 @@
+/*
+ * libdeflate.h - public header for libdeflate
+ */
+
+#ifndef LIBDEFLATE_H
+#define LIBDEFLATE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBDEFLATE_VERSION_MAJOR 1
+#define LIBDEFLATE_VERSION_MINOR 18
+#define LIBDEFLATE_VERSION_STRING "1.18"
+
+/*
+ * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause
+ * __declspec(dllimport) to be used. This should be done when it's easy to do.
+ * Otherwise it's fine to skip it, since it is a very minor performance
+ * optimization that is irrelevant for most use cases of libdeflate.
+ */
+#ifndef LIBDEFLATEAPI
+# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__))
+# define LIBDEFLATEAPI __declspec(dllimport)
+# else
+# define LIBDEFLATEAPI
+# endif
+#endif
+
+/* ========================================================================== */
+/* Compression */
+/* ========================================================================== */
+
+struct libdeflate_compressor;
+struct libdeflate_options;
+
+/*
+ * libdeflate_alloc_compressor() allocates a new compressor that supports
+ * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression
+ * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
+ * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means
+ * "no compression", specifically "create a valid stream, but only emit
+ * uncompressed blocks" (this will expand the data slightly).
+ *
+ * The return value is a pointer to the new compressor, or NULL if out of memory
+ * or if the compression level is invalid (i.e. outside the range [0, 12]).
+ *
+ * Note: for compression, the sliding window size is defined at compilation time
+ * to 32768, the largest size permissible in the DEFLATE format. It cannot be
+ * changed at runtime.
+ *
+ * A single compressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different compressors concurrently.
+ */
+LIBDEFLATEAPI struct libdeflate_compressor *
+libdeflate_alloc_compressor(int compression_level);
+
+/*
+ * Like libdeflate_alloc_compressor(), but adds the 'options' argument.
+ */
+LIBDEFLATEAPI struct libdeflate_compressor *
+libdeflate_alloc_compressor_ex(int compression_level,
+ const struct libdeflate_options *options);
+
+/*
+ * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
+ * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and
+ * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The
+ * return value is the compressed size in bytes, or 0 if the data could not be
+ * compressed to 'out_nbytes_avail' bytes or fewer (but see note below).
+ *
+ * If compression is successful, then the output data is guaranteed to be a
+ * valid DEFLATE stream that decompresses to the input data. No other
+ * guarantees are made about the output data. Notably, different versions of
+ * libdeflate can produce different compressed data for the same uncompressed
+ * data, even at the same compression level. Do ***NOT*** do things like
+ * writing tests that compare compressed data to a golden output, as this can
+ * break when libdeflate is updated. (This property isn't specific to
+ * libdeflate; the same is true for zlib and other compression libraries too.)
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
+ * number of bytes of compressed data that may be produced by compressing any
+ * buffer of length less than or equal to 'in_nbytes' using
+ * libdeflate_deflate_compress() with the specified compressor. This bound will
+ * necessarily be a number greater than or equal to 'in_nbytes'. It may be an
+ * overestimate of the true upper bound. The return value is guaranteed to be
+ * the same for all invocations with the same compressor and same 'in_nbytes'.
+ *
+ * As a special case, 'compressor' may be NULL. This causes the bound to be
+ * taken across *any* libdeflate_compressor that could ever be allocated with
+ * this build of the library, with any options.
+ *
+ * Note that this function is not necessary in many applications. With
+ * block-based compression, it is usually preferable to separately store the
+ * uncompressed size of each block and to store any blocks that did not compress
+ * to less than their original size uncompressed. In that scenario, there is no
+ * need to know the worst-case compressed size, since the maximum number of
+ * bytes of compressed data that may be used would always be one less than the
+ * input length. You can just pass a buffer of that size to
+ * libdeflate_deflate_compress() and store the data uncompressed if
+ * libdeflate_deflate_compress() returns 0, indicating that the compressed data
+ * did not fit into the provided output buffer.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
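
For the block-based scenario the comment above describes, the bound is not needed at all: hand libdeflate_deflate_compress() an output buffer one byte smaller than the input and store the block verbatim when it returns 0. A minimal sketch of that fallback; compress_block and stored_compressed are illustrative names, not part of the library:

#include <string.h>
#include "libdeflate.h"

static size_t
compress_block(struct libdeflate_compressor *c,
               const void *in, size_t in_nbytes,
               void *out, int *stored_compressed)
{
    size_t n = 0;

    /* Only try to compress when the output could actually be smaller. */
    if (in_nbytes > 1)
        n = libdeflate_deflate_compress(c, in, in_nbytes,
                                        out, in_nbytes - 1);
    if (n != 0) {
        *stored_compressed = 1;
        return n;
    }
    /* Did not shrink: store the block uncompressed instead.
     * 'out' must hold at least in_nbytes bytes. */
    memcpy(out, in, in_nbytes);
    *stored_compressed = 0;
    return in_nbytes;
}
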
+
+/*
+ * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead
+ * of raw DEFLATE.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_zlib_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
+
+/*
+ * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead
+ * of raw DEFLATE.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_gzip_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
+
+/*
+ * libdeflate_free_compressor() frees a compressor that was allocated with
+ * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is
+ * taken.
+ */
+LIBDEFLATEAPI void
+libdeflate_free_compressor(struct libdeflate_compressor *compressor);
+
+/* ========================================================================== */
+/* Decompression */
+/* ========================================================================== */
+
+struct libdeflate_decompressor;
+struct libdeflate_options;
+
+/*
+ * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
+ * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to
+ * the new decompressor, or NULL if out of memory.
+ *
+ * This function takes no parameters, and the returned decompressor is valid for
+ * decompressing data that was compressed at any compression level and with any
+ * sliding window size.
+ *
+ * A single decompressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different decompressors concurrently.
+ */
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor(void);
+
+/*
+ * Like libdeflate_alloc_decompressor(), but adds the 'options' argument.
+ */
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options);
+
+/*
+ * Result of a call to libdeflate_deflate_decompress(),
+ * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
+ */
+enum libdeflate_result {
+ /* Decompression was successful. */
+ LIBDEFLATE_SUCCESS = 0,
+
+ /* Decompression failed because the compressed data was invalid,
+ * corrupt, or otherwise unsupported. */
+ LIBDEFLATE_BAD_DATA = 1,
+
+ /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
+ * decompressed to fewer than 'out_nbytes_avail' bytes. */
+ LIBDEFLATE_SHORT_OUTPUT = 2,
+
+ /* The data would have decompressed to more than 'out_nbytes_avail'
+ * bytes. */
+ LIBDEFLATE_INSUFFICIENT_SPACE = 3,
+};
+
+/*
+ * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer
+ * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is
+ * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If
+ * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise,
+ * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the
+ * contents of the output buffer are undefined.
+ *
+ * Decompression stops at the end of the DEFLATE stream (as indicated by the
+ * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
+ *
+ * libdeflate_deflate_decompress() can be used in cases where the actual
+ * uncompressed size is known (recommended) or unknown (not recommended):
+ *
+ * - If the actual uncompressed size is known, then pass the actual
+ * uncompressed size as 'out_nbytes_avail' and pass NULL for
+ * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail
+ * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
+ * specified number of bytes.
+ *
+ * - If the actual uncompressed size is unknown, then provide a non-NULL
+ * 'actual_out_nbytes_ret' and provide a buffer with some size
+ * 'out_nbytes_avail' that you think is large enough to hold all the
+ * uncompressed data. In this case, if the data decompresses to less than
+ * or equal to 'out_nbytes_avail' bytes, then
+ * libdeflate_deflate_decompress() will write the actual uncompressed size
+ * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise,
+ * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
+ * not large enough but no other problems were encountered, or another
+ * nonzero result code if decompression failed for another reason.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
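
A small sketch of the recommended "known uncompressed size" mode described above, built only from functions declared in this header; the wrapper name is illustrative:

#include "libdeflate.h"

/* Decompress a DEFLATE stream whose uncompressed size is known exactly
 * (e.g. it was stored alongside the compressed block).
 * Returns 0 on success, -1 on failure. */
static int
inflate_known_size(const void *in, size_t in_nbytes,
                   void *out, size_t uncompressed_size)
{
    struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
    enum libdeflate_result res;

    if (!d)
        return -1;
    /* Passing NULL for actual_out_nbytes_ret makes a short result fail
     * with LIBDEFLATE_SHORT_OUTPUT instead of succeeding silently. */
    res = libdeflate_deflate_decompress(d, in, in_nbytes,
                                        out, uncompressed_size, NULL);
    libdeflate_free_decompressor(d);
    return res == LIBDEFLATE_SUCCESS ? 0 : -1;
}
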
+
+/*
+ * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
+ * then the actual compressed size of the DEFLATE stream (aligned to the next
+ * byte boundary) is written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
+ * instead of raw DEFLATE.
+ *
+ * Decompression will stop at the end of the zlib stream, even if it is shorter
+ * than 'in_nbytes'. If you need to know exactly where the zlib stream ended,
+ * use libdeflate_zlib_decompress_ex().
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first zlib-compressed stream in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
+ * instead of raw DEFLATE.
+ *
+ * If multiple gzip-compressed members are concatenated, then only the first
+ * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need
+ * multi-member support.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first gzip-compressed member in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
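
Since libdeflate_gzip_decompress() only handles the first member of a concatenated gzip stream, multi-member input has to be walked with the _ex variant and actual_in_nbytes_ret. A rough sketch, assuming every member fits in one caller-provided buffer (real code would grow the buffer or stream the output):

static int
inflate_gzip_members(struct libdeflate_decompressor *d,
                     const unsigned char *in, size_t in_nbytes,
                     void *chunk, size_t chunk_size)
{
    while (in_nbytes > 0) {
        size_t in_used = 0, out_used = 0;
        enum libdeflate_result res =
            libdeflate_gzip_decompress_ex(d, in, in_nbytes,
                                          chunk, chunk_size,
                                          &in_used, &out_used);
        if (res != LIBDEFLATE_SUCCESS)
            return -1;
        /* ... consume 'out_used' bytes from 'chunk' here ... */
        in += in_used;
        in_nbytes -= in_used;
    }
    return 0;
}
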
+
+/*
+ * libdeflate_free_decompressor() frees a decompressor that was allocated with
+ * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action
+ * is taken.
+ */
+LIBDEFLATEAPI void
+libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
+
+/* ========================================================================== */
+/* Checksums */
+/* ========================================================================== */
+
+/*
+ * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
+ * data and returns the updated checksum. When starting a new checksum, the
+ * required initial value for 'adler' is 1. This value is also returned when
+ * 'buffer' is specified as NULL.
+ */
+LIBDEFLATEAPI uint32_t
+libdeflate_adler32(uint32_t adler, const void *buffer, size_t len);
+
+
+/*
+ * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
+ * and returns the updated checksum. When starting a new checksum, the required
+ * initial value for 'crc' is 0. This value is also returned when 'buffer' is
+ * specified as NULL.
+ */
+LIBDEFLATEAPI uint32_t
+libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
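
Both checksum helpers are meant for incremental use: seed Adler-32 with 1 and CRC-32 with 0, then feed the returned value back in. A short sketch of checksumming a buffer in fixed-size chunks (the helper name is illustrative; 'chunk' is assumed to be nonzero):

#include "libdeflate.h"

static uint32_t
crc32_in_chunks(const unsigned char *data, size_t len, size_t chunk)
{
    uint32_t crc = 0; /* required initial value for CRC-32 */

    while (len > 0) {
        size_t n = len < chunk ? len : chunk;

        crc = libdeflate_crc32(crc, data, n);
        data += n;
        len -= n;
    }
    return crc;
}
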
+
+/* ========================================================================== */
+/* Custom memory allocator */
+/* ========================================================================== */
+
+/*
+ * Install a custom memory allocator which libdeflate will use for all memory
+ * allocations by default. 'malloc_func' is a function that must behave like
+ * malloc(), and 'free_func' is a function that must behave like free().
+ *
+ * The per-(de)compressor custom memory allocator that can be specified in
+ * 'struct libdeflate_options' takes priority over this.
+ *
+ * This doesn't affect the free() function that will be used to free
+ * (de)compressors that were already in existence when this is called.
+ */
+LIBDEFLATEAPI void
+libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
+ void (*free_func)(void *));
+
+/*
+ * Advanced options. This is the options structure that
+ * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex()
+ * require. Most users won't need this and should just use the non-"_ex"
+ * functions instead. If you do need this, it should be initialized like this:
+ *
+ * struct libdeflate_options options;
+ *
+ * memset(&options, 0, sizeof(options));
+ * options.sizeof_options = sizeof(options);
+ * // Then set the fields that you need to override the defaults for.
+ */
+struct libdeflate_options {
+
+ /*
+ * This field must be set to the struct size. This field exists for
+ * extensibility, so that fields can be appended to this struct in
+ * future versions of libdeflate while still supporting old binaries.
+ */
+ size_t sizeof_options;
+
+ /*
+ * An optional custom memory allocator to use for this (de)compressor.
+ * 'malloc_func' must be a function that behaves like malloc(), and
+ * 'free_func' must be a function that behaves like free().
+ *
+ * This is useful in cases where a process might have multiple users of
+ * libdeflate who want to use different memory allocators. For example,
+ * a library might want to use libdeflate with a custom memory allocator
+ * without interfering with user code that might use libdeflate too.
+ *
+ * This takes priority over the "global" memory allocator (which by
+ * default is malloc() and free(), but can be changed by
+ * libdeflate_set_memory_allocator()). Moreover, libdeflate will never
+ * call the "global" memory allocator if a per-(de)compressor custom
+ * allocator is always given.
+ */
+ void *(*malloc_func)(size_t);
+ void (*free_func)(void *);
+};
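
The options struct above lets each (de)compressor carry its own allocator. A minimal sketch combining it with libdeflate_alloc_compressor_ex(); arena_malloc and arena_free stand in for whatever allocator the caller already has:

#include <string.h>
#include "libdeflate.h"

/* Hypothetical caller-provided allocator functions. */
extern void *arena_malloc(size_t size);
extern void arena_free(void *ptr);

static struct libdeflate_compressor *
alloc_compressor_with_arena(int level)
{
    struct libdeflate_options options;

    /* Zero-initialize and set sizeof_options, as documented above. */
    memset(&options, 0, sizeof(options));
    options.sizeof_options = sizeof(options);
    options.malloc_func = arena_malloc;
    options.free_func = arena_free;
    return libdeflate_alloc_compressor_ex(level, &options);
}
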
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBDEFLATE_H */
diff --git a/Source/ThirdParty/OpenFBX/ofbx.cpp b/Source/ThirdParty/OpenFBX/ofbx.cpp
index e60211e3d..67f34bfeb 100644
--- a/Source/ThirdParty/OpenFBX/ofbx.cpp
+++ b/Source/ThirdParty/OpenFBX/ofbx.cpp
@@ -1,5 +1,5 @@
#include "ofbx.h"
-#include "miniz.h"
+#include "libdeflate.h"
#include
#include
#include
@@ -8,11 +8,39 @@
#include
#include
#include
+#include
+#include
+#include
+#if __cplusplus >= 202002L && defined(__cpp_lib_bit_cast)
+#include <bit> // for std::bit_cast (C++20 and later)
+#endif
+#include