diff --git a/Source/ThirdParty/meshoptimizer/LICENSE.md b/Source/ThirdParty/meshoptimizer/LICENSE.md index 4fcd766d2..962ed41ff 100644 --- a/Source/ThirdParty/meshoptimizer/LICENSE.md +++ b/Source/ThirdParty/meshoptimizer/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016-2020 Arseny Kapoulkine +Copyright (c) 2016-2023 Arseny Kapoulkine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Source/ThirdParty/meshoptimizer/allocator.cpp b/Source/ThirdParty/meshoptimizer/allocator.cpp index da7cc540b..072e8e51a 100644 --- a/Source/ThirdParty/meshoptimizer/allocator.cpp +++ b/Source/ThirdParty/meshoptimizer/allocator.cpp @@ -1,7 +1,7 @@ // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details #include "meshoptimizer.h" -void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*)) +void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*)) { meshopt_Allocator::Storage::allocate = allocate; meshopt_Allocator::Storage::deallocate = deallocate; diff --git a/Source/ThirdParty/meshoptimizer/clusterizer.cpp b/Source/ThirdParty/meshoptimizer/clusterizer.cpp index f7d88c513..c4672ad60 100644 --- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp +++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp @@ -2,6 +2,7 @@ #include "meshoptimizer.h" #include +#include #include #include @@ -12,6 +13,68 @@ namespace meshopt { +// This must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet +const size_t kMeshletMaxVertices = 255; + +// A reasonable limit is around 2*max_vertices or less +const size_t kMeshletMaxTriangles = 512; + +struct TriangleAdjacency2 +{ + unsigned int* counts; + unsigned int* offsets; + unsigned int* data; +}; + +static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) +{ + size_t face_count = index_count / 3; + + // allocate arrays + adjacency.counts = allocator.allocate(vertex_count); + adjacency.offsets = allocator.allocate(vertex_count); + adjacency.data = allocator.allocate(index_count); + + // fill triangle counts + memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + + adjacency.counts[indices[i]]++; + } + + // fill offset table + unsigned int offset = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + adjacency.offsets[i] = offset; + offset += adjacency.counts[i]; + } + + assert(offset == index_count); + + // fill triangle data + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + + adjacency.data[adjacency.offsets[a]++] = unsigned(i); + adjacency.data[adjacency.offsets[b]++] = unsigned(i); + adjacency.data[adjacency.offsets[c]++] = unsigned(i); + } + + // fix offsets that have been disturbed by the previous pass + for (size_t i = 0; i < vertex_count; ++i) + { + assert(adjacency.offsets[i] >= adjacency.counts[i]); + + adjacency.offsets[i] -= adjacency.counts[i]; + } +} + static void computeBoundingSphere(float result[4], const float points[][3], size_t count) { assert(count > 0); @@ -82,13 +145,382 @@ static void computeBoundingSphere(float result[4], const float points[][3], size result[3] =
radius; } +struct Cone +{ + float px, py, pz; + float nx, ny, nz; +}; + +static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius) +{ + float cone = 1.f - spread * cone_weight; + float cone_clamped = cone < 1e-3f ? 1e-3f : cone; + + return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped; +} + +static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count) +{ + Cone result = acc; + + float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count); + + result.px *= center_scale; + result.py *= center_scale; + result.pz *= center_scale; + + float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz; + float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length); + + result.nx *= axis_scale; + result.ny *= axis_scale; + result.nz *= axis_scale; + + return result; +} + +static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + (void)vertex_count; + + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + size_t face_count = index_count / 3; + + float mesh_area = 0; + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* p0 = vertex_positions + vertex_stride_float * a; + const float* p1 = vertex_positions + vertex_stride_float * b; + const float* p2 = vertex_positions + vertex_stride_float * c; + + float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; + float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; + + float normalx = p10[1] * p20[2] - p10[2] * p20[1]; + float normaly = p10[2] * p20[0] - p10[0] * p20[2]; + float normalz = p10[0] * p20[1] - p10[1] * p20[0]; + + float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); + float invarea = (area == 0.f) ? 
0.f : 1.f / area; + + triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f; + triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f; + triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f; + + triangles[i].nx = normalx * invarea; + triangles[i].ny = normaly * invarea; + triangles[i].nz = normalz * invarea; + + mesh_area += area; + } + + return mesh_area; +} + +static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles) +{ + size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; + + // fill 4b padding with 0 + while (offset & 3) + meshlet_triangles[offset++] = 0; +} + +static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles) +{ + unsigned char& av = used[a]; + unsigned char& bv = used[b]; + unsigned char& cv = used[c]; + + bool result = false; + + unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + + if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) + { + meshlets[meshlet_offset] = meshlet; + + for (size_t j = 0; j < meshlet.vertex_count; ++j) + used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff; + + finishMeshlet(meshlet, meshlet_triangles); + + meshlet.vertex_offset += meshlet.vertex_count; + meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding + meshlet.vertex_count = 0; + meshlet.triangle_count = 0; + + result = true; + } + + if (av == 0xff) + { + av = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a; + } + + if (bv == 0xff) + { + bv = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b; + } + + if (cv == 0xff) + { + cv = (unsigned char)meshlet.vertex_count; + meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c; + } + + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv; + meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv; + meshlet.triangle_count++; + + return result; +} + +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +{ + unsigned int best_triangle = ~0u; + unsigned int best_extra = 5; + float best_score = FLT_MAX; + + for (size_t i = 0; i < meshlet.vertex_count; ++i) + { + unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; + + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; + + for (size_t j = 0; j < neighbors_size; ++j) + { + unsigned int triangle = neighbors[j]; + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + + unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + + // triangles that don't add new vertices to meshlets are max. 
priority + if (extra != 0) + { + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + extra = 0; + + extra++; + } + + // since topology-based priority is always more important than the score, we can skip scoring in some cases + if (extra > best_extra) + continue; + + float score = 0; + + // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles) + if (meshlet_cone) + { + const Cone& tri_cone = triangles[triangle]; + + float distance2 = + (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) + + (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) + + (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz); + + float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz; + + score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); + } + else + { + // each live_triangles entry is >= 1 since it includes the current triangle we're processing + score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3); + } + + // note that topology-based priority is always more important than the score + // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost + if (extra < best_extra || score < best_score) + { + best_triangle = triangle; + best_extra = extra; + best_score = score; + } + } + } + + if (out_extra) + *out_extra = best_extra; + + return best_triangle; +} + +struct KDNode +{ + union + { + float split; + unsigned int index; + }; + + // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point) + // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children + unsigned int axis : 2; + unsigned int children : 30; +}; + +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = points[indices[i] * stride + axis]; + + // swap(m, i) unconditionally + unsigned int t = indices[m]; + indices[m] = indices[i]; + indices[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count) +{ + assert(offset + count <= node_count); + (void)node_count; + + KDNode& result = nodes[offset]; + + result.index = indices[0]; + result.axis = 3; + result.children = unsigned(count - 1); + + // all remaining points are stored in nodes immediately following the leaf + for (size_t i = 1; i < count; ++i) + { + KDNode& tail = nodes[offset + i]; + + tail.index = indices[i]; + tail.axis = 3; + tail.children = ~0u >> 2; // bogus value to prevent misuse + } + + return offset + count; +} + +static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size) +{ + assert(count > 0); + assert(offset < node_count); + + if (count <= leaf_size) + return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); + + float mean[3] = {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather 
statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = points + indices[i] * stride; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1 : 2; + + float split = mean[axis]; + size_t middle = kdtreePartition(indices, count, points, stride, axis, split); + + // when the partition is degenerate simply consolidate the points into a single node + if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2) + return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); + + KDNode& result = nodes[offset]; + + result.split = split; + result.axis = axis; + + // left subtree is right after our node + size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size); + + // distance to the right subtree is represented explicitly + result.children = unsigned(next_offset - offset - 1); + + return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); +} + +static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) +{ + const KDNode& node = nodes[root]; + + if (node.axis == 3) + { + // leaf + for (unsigned int i = 0; i <= node.children; ++i) + { + unsigned int index = nodes[root + i].index; + + if (emitted_flags[index]) + continue; + + const float* point = points + index * stride; + + float distance2 = + (point[0] - position[0]) * (point[0] - position[0]) + + (point[1] - position[1]) * (point[1] - position[1]) + + (point[2] - position[2]) * (point[2] - position[2]); + float distance = sqrtf(distance2); + + if (distance < limit) + { + result = index; + limit = distance; + } + } + } + else + { + // branch; we order recursion to process the node that search position is in first + float delta = position[node.axis] - node.split; + unsigned int first = (delta <= 0) ? 
0 : node.children; + unsigned int second = first ^ node.children; + + kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); + + // only process the other node if it can have a match based on closest distance so far + if (fabsf(delta) <= limit) + kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit); + } +} + } // namespace meshopt size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) { + using namespace meshopt; + assert(index_count % 3 == 0); - assert(max_vertices >= 3); - assert(max_triangles >= 1); + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + (void)kMeshletMaxVertices; + (void)kMeshletMaxTriangles; // meshlet construction is limited by max vertices and max triangles per meshlet // the worst case is that the input is an unindexed stream since this equally stresses both limits @@ -100,77 +532,181 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles; } -size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) { + using namespace meshopt; + assert(index_count % 3 == 0); - assert(max_vertices >= 3); - assert(max_triangles >= 1); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + assert(cone_weight >= 0 && cone_weight <= 1); meshopt_Allocator allocator; - meshopt_Meshlet meshlet; - memset(&meshlet, 0, sizeof(meshlet)); + TriangleAdjacency2 adjacency = {}; + buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); - assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0])); - assert(max_triangles <= sizeof(meshlet.indices) / 3); + unsigned int* live_triangles = allocator.allocate(vertex_count); + memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); + + size_t face_count = index_count / 3; + + unsigned char* emitted_flags = allocator.allocate(face_count); + memset(emitted_flags, 0, face_count); + + // for each triangle, precompute centroid & normal to use for scoring + Cone* triangles = allocator.allocate(face_count); + float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + + // assuming each meshlet is a square patch, expected radius is sqrt(expected area) + float triangle_area_avg = face_count == 0 ? 
0.f : mesh_area / float(face_count) * 0.5f; + float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f; + + // build a kd-tree for nearest neighbor lookup + unsigned int* kdindices = allocator.allocate(face_count); + for (size_t i = 0; i < face_count; ++i) + kdindices[i] = unsigned(i); + + KDNode* nodes = allocator.allocate(face_count * 2); + kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8); // index of the vertex in the meshlet, 0xff if the vertex isn't used unsigned char* used = allocator.allocate(vertex_count); memset(used, -1, vertex_count); - size_t offset = 0; + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; + + Cone meshlet_cone_acc = {}; + + for (;;) + { + Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); + + unsigned int best_extra = 0; + unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); + + // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring + if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) + { + best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL); + } + + // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity + if (best_triangle == ~0u) + { + float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; + unsigned int index = ~0u; + float limit = FLT_MAX; + + kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit); + + best_triangle = index; + } + + if (best_triangle == ~0u) + break; + + unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + // add meshlet to the output; when the current meshlet is full we reset the accumulated bounds + if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles)) + { + meshlet_offset++; + memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc)); + } + + live_triangles[a]--; + live_triangles[b]--; + live_triangles[c]--; + + // remove emitted triangle from adjacency data + // this makes sure that we spend less time traversing these lists on subsequent iterations + for (size_t k = 0; k < 3; ++k) + { + unsigned int index = indices[best_triangle * 3 + k]; + + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; + + for (size_t i = 0; i < neighbors_size; ++i) + { + unsigned int tri = neighbors[i]; + + if (tri == best_triangle) + { + neighbors[i] = neighbors[neighbors_size - 1]; + adjacency.counts[index]--; + break; + } + } + } + + // update aggregated meshlet cone data for scoring subsequent triangles + meshlet_cone_acc.px += triangles[best_triangle].px; + meshlet_cone_acc.py += triangles[best_triangle].py; + meshlet_cone_acc.pz += triangles[best_triangle].pz; + meshlet_cone_acc.nx += triangles[best_triangle].nx; + meshlet_cone_acc.ny += 
triangles[best_triangle].ny; + meshlet_cone_acc.nz += triangles[best_triangle].nz; + + emitted_flags[best_triangle] = 1; + } + + if (meshlet.triangle_count) + { + finishMeshlet(meshlet, meshlet_triangles); + + meshlets[meshlet_offset++] = meshlet; + } + + assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + return meshlet_offset; +} + +size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); + assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + meshopt_Allocator allocator; + + // index of the vertex in the meshlet, 0xff if the vertex isn't used + unsigned char* used = allocator.allocate(vertex_count); + memset(used, -1, vertex_count); + + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; for (size_t i = 0; i < index_count; i += 3) { unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; assert(a < vertex_count && b < vertex_count && c < vertex_count); - unsigned char& av = used[a]; - unsigned char& bv = used[b]; - unsigned char& cv = used[c]; - - unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); - - if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) - { - destination[offset++] = meshlet; - - for (size_t j = 0; j < meshlet.vertex_count; ++j) - used[meshlet.vertices[j]] = 0xff; - - memset(&meshlet, 0, sizeof(meshlet)); - } - - if (av == 0xff) - { - av = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = a; - } - - if (bv == 0xff) - { - bv = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = b; - } - - if (cv == 0xff) - { - cv = meshlet.vertex_count; - meshlet.vertices[meshlet.vertex_count++] = c; - } - - meshlet.indices[meshlet.triangle_count][0] = av; - meshlet.indices[meshlet.triangle_count][1] = bv; - meshlet.indices[meshlet.triangle_count][2] = cv; - meshlet.triangle_count++; + // appends triangle to the meshlet and writes previous meshlet to the output if full + meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles); } if (meshlet.triangle_count) - destination[offset++] = meshlet; + { + finishMeshlet(meshlet, meshlet_triangles); - assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + meshlets[meshlet_offset++] = meshlet; + } - return offset; + assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + return meshlet_offset; } meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) @@ -178,18 +714,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(index_count / 3 <= kMeshletMaxTriangles); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - 
assert(index_count / 3 <= 256); - (void)vertex_count; size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // compute triangle normals and gather triangle corners - float normals[256][3]; - float corners[256][3][3]; + float normals[kMeshletMaxTriangles][3]; + float corners[kMeshletMaxTriangles][3][3]; size_t triangles = 0; for (size_t i = 0; i < index_count; i += 3) @@ -327,25 +862,23 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t return bounds; } -meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + using namespace meshopt; + + assert(triangle_count <= kMeshletMaxTriangles); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); - unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])]; + unsigned int indices[kMeshletMaxTriangles * 3]; - for (size_t i = 0; i < meshlet->triangle_count; ++i) + for (size_t i = 0; i < triangle_count * 3; ++i) { - unsigned int a = meshlet->vertices[meshlet->indices[i][0]]; - unsigned int b = meshlet->vertices[meshlet->indices[i][1]]; - unsigned int c = meshlet->vertices[meshlet->indices[i][2]]; + unsigned int index = meshlet_vertices[meshlet_triangles[i]]; + assert(index < vertex_count); - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - indices[i * 3 + 0] = a; - indices[i * 3 + 1] = b; - indices[i * 3 + 2] = c; + indices[i] = index; } - return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); + return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); } diff --git a/Source/ThirdParty/meshoptimizer/indexcodec.cpp b/Source/ThirdParty/meshoptimizer/indexcodec.cpp index eeb541e5b..4cc2fea63 100644 --- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp +++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp @@ -4,14 +4,6 @@ #include #include -#ifndef TRACE -#define TRACE 0 -#endif - -#if TRACE -#include -#endif - // This work is based on: // Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013 // Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 
2014 @@ -21,7 +13,7 @@ namespace meshopt const unsigned char kIndexHeader = 0xe0; const unsigned char kSequenceHeader = 0xd0; -static int gEncodeIndexVersion = 0; +static int gEncodeIndexVersion = 1; typedef unsigned int VertexFifo[16]; typedef unsigned int EdgeFifo[16][2]; @@ -116,7 +108,7 @@ static unsigned int decodeVByte(const unsigned char*& data) for (int i = 0; i < 4; ++i) { unsigned char group = *data++; - result |= (group & 127) << shift; + result |= unsigned(group & 127) << shift; shift += 7; if (group < 128) @@ -167,38 +159,6 @@ static void writeTriangle(void* destination, size_t offset, size_t index_size, u } } -#if TRACE -static size_t sortTop16(unsigned char dest[16], size_t stats[256]) -{ - size_t destsize = 0; - - for (size_t i = 0; i < 256; ++i) - { - size_t j = 0; - for (; j < destsize; ++j) - { - if (stats[i] >= stats[dest[j]]) - { - if (destsize < 16) - destsize++; - - memmove(&dest[j + 1], &dest[j], destsize - 1 - j); - dest[j] = (unsigned char)i; - break; - } - } - - if (j == destsize && destsize < 16) - { - dest[destsize] = (unsigned char)i; - destsize++; - } - } - - return destsize; -} -#endif - } // namespace meshopt size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count) @@ -207,11 +167,6 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons assert(index_count % 3 == 0); -#if TRACE - size_t codestats[256] = {}; - size_t codeauxstats[256] = {}; -#endif - // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table if (buffer_size < 1 + index_count / 3 + 16) return 0; @@ -275,10 +230,6 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons *code++ = (unsigned char)((fe << 4) | fec); -#if TRACE - codestats[code[-1]]++; -#endif - // note that we need to update the last index since free indices are delta-encoded if (fec == 15) encodeIndex(data, c, last), last = c; @@ -334,11 +285,6 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons *data++ = codeaux; } -#if TRACE - codestats[code[-1]]++; - codeauxstats[codeaux]++; -#endif - // note that we need to update the last index since free indices are delta-encoded if (fea == 15) encodeIndex(data, a, last), last = a; @@ -387,30 +333,6 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons assert(data >= buffer + index_count / 3 + 16); assert(data <= buffer + buffer_size); -#if TRACE - unsigned char codetop[16], codeauxtop[16]; - size_t codetopsize = sortTop16(codetop, codestats); - size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats); - - size_t sumcode = 0, sumcodeaux = 0; - for (size_t i = 0; i < 256; ++i) - sumcode += codestats[i], sumcodeaux += codeauxstats[i]; - - size_t acccode = 0, acccodeaux = 0; - - printf("code\t\t\t\t\tcodeaux\n"); - - for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i) - { - acccode += codestats[codetop[i]]; - acccodeaux += codeauxstats[codeauxtop[i]]; - - printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n", - int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100, - int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100); - } -#endif - return data - buffer; } diff --git a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp 
b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp index aa4a30efa..f6728345a 100644 --- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp +++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp @@ -4,6 +4,8 @@ #include #include +// This work is based on: +// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010 namespace meshopt { @@ -83,10 +85,49 @@ struct VertexStreamHasher } }; +struct EdgeHasher +{ + const unsigned int* remap; + + size_t hash(unsigned long long edge) const + { + unsigned int e0 = unsigned(edge >> 32); + unsigned int e1 = unsigned(edge); + + unsigned int h1 = remap[e0]; + unsigned int h2 = remap[e1]; + + const unsigned int m = 0x5bd1e995; + + // MurmurHash64B finalizer + h1 ^= h2 >> 18; + h1 *= m; + h2 ^= h1 >> 22; + h2 *= m; + h1 ^= h2 >> 17; + h1 *= m; + h2 ^= h1 >> 19; + h2 *= m; + + return h2; + } + + bool equal(unsigned long long lhs, unsigned long long rhs) const + { + unsigned int l0 = unsigned(lhs >> 32); + unsigned int l1 = unsigned(lhs); + + unsigned int r0 = unsigned(rhs >> 32); + unsigned int r1 = unsigned(rhs); + + return remap[l0] == remap[r0] && remap[l1] == remap[r1]; + } +}; + static size_t hashBuckets(size_t count) { size_t buckets = 1; - while (buckets < count) + while (buckets < count + count / 4) buckets *= 2; return buckets; @@ -116,7 +157,43 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c } assert(false && "Hash table is full"); // unreachable - return 0; + return NULL; +} + +static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator) +{ + VertexHasher vertex_hasher = {reinterpret_cast(vertex_positions), 3 * sizeof(float), vertex_positions_stride}; + + size_t vertex_table_size = hashBuckets(vertex_count); + unsigned int* vertex_table = allocator.allocate(vertex_table_size); + memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int index = unsigned(i); + unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u); + + if (*entry == ~0u) + *entry = index; + + remap[index] = *entry; + } + + allocator.deallocate(vertex_table); +} + +template +static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) +{ + size_t block_size = BlockSize == 0 ? 
vertex_size : BlockSize; + assert(block_size == vertex_size); + + for (size_t i = 0; i < vertex_count; ++i) + if (remap[i] != ~0u) + { + assert(remap[i] < vertex_count); + memcpy(static_cast(destination) + remap[i] * block_size, static_cast(vertices) + i * block_size, block_size); + } } } // namespace meshopt @@ -126,7 +203,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int using namespace meshopt; assert(indices || index_count == vertex_count); - assert(index_count % 3 == 0); + assert(!indices || index_count % 3 == 0); assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; @@ -227,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) { + using namespace meshopt; + assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; @@ -239,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v vertices = vertices_copy; } - for (size_t i = 0; i < vertex_count; ++i) + // specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic + switch (vertex_size) { - if (remap[i] != ~0u) - { - assert(remap[i] < vertex_count); + case 4: + return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap); - memcpy(static_cast(destination) + remap[i] * vertex_size, static_cast(vertices) + i * vertex_size, vertex_size); - } + case 8: + return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap); + + case 12: + return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap); + + case 16: + return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap); + + default: + return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap); } } @@ -345,3 +433,146 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns destination[i] = remap[index]; } } + +void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + static const int next[4] = {1, 2, 0, 1}; + + // build position remap: for each vertex, which other (canonical) vertex does it map to? 
+ unsigned int* remap = allocator.allocate(vertex_count); + buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator); + + // build edge set; this stores all triangle edges but we can look these up by any other wedge + EdgeHasher edge_hasher = {remap}; + + size_t edge_table_size = hashBuckets(index_count); + unsigned long long* edge_table = allocator.allocate(edge_table_size); + unsigned int* edge_vertex_table = allocator.allocate(edge_table_size); + + memset(edge_table, -1, edge_table_size * sizeof(unsigned long long)); + memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; i += 3) + { + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + unsigned int i2 = indices[i + next[e + 1]]; + assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count); + + unsigned long long edge = ((unsigned long long)i0 << 32) | i1; + unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + if (*entry == ~0ull) + { + *entry = edge; + + // store vertex opposite to the edge + edge_vertex_table[entry - edge_table] = i2; + } + } + } + + // build resulting index buffer: 6 indices for each input triangle + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int patch[6]; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + // note: this refers to the opposite edge! + unsigned long long edge = ((unsigned long long)i1 << 32) | i0; + unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + patch[e * 2 + 0] = i0; + patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table]; + } + + memcpy(destination + i * 2, patch, sizeof(patch)); + } +} + +void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + static const int next[3] = {1, 2, 0}; + + // build position remap: for each vertex, which other (canonical) vertex does it map to? 
+ unsigned int* remap = allocator.allocate(vertex_count); + buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator); + + // build edge set; this stores all triangle edges but we can look these up by any other wedge + EdgeHasher edge_hasher = {remap}; + + size_t edge_table_size = hashBuckets(index_count); + unsigned long long* edge_table = allocator.allocate(edge_table_size); + memset(edge_table, -1, edge_table_size * sizeof(unsigned long long)); + + for (size_t i = 0; i < index_count; i += 3) + { + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + unsigned long long edge = ((unsigned long long)i0 << 32) | i1; + unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + if (*entry == ~0ull) + *entry = edge; + } + } + + // build resulting index buffer: 12 indices for each input triangle + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int patch[12]; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + assert(i0 < vertex_count && i1 < vertex_count); + + // note: this refers to the opposite edge! + unsigned long long edge = ((unsigned long long)i1 << 32) | i0; + unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull); + + // use the same edge if opposite edge doesn't exist (border) + oppe = (oppe == ~0ull) ? edge : oppe; + + // triangle index (0, 1, 2) + patch[e] = i0; + + // opposite edge (3, 4; 5, 6; 7, 8) + patch[3 + e * 2 + 0] = unsigned(oppe); + patch[3 + e * 2 + 1] = unsigned(oppe >> 32); + + // dominant vertex (9, 10, 11) + patch[9 + e] = remap[i0]; + } + + memcpy(destination + i * 4, patch, sizeof(patch)); + } +} diff --git a/Source/ThirdParty/meshoptimizer/meshoptimizer.h b/Source/ThirdParty/meshoptimizer/meshoptimizer.h index cb030ea29..dbafd4e6e 100644 --- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h +++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h @@ -1,7 +1,7 @@ /** - * meshoptimizer - version 0.14 + * meshoptimizer - version 0.20 * - * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Copyright (C) 2016-2023, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer * * This library is distributed under the MIT License. See notice at the end of this file. @@ -12,13 +12,22 @@ #include /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 140 +#define MESHOPTIMIZER_VERSION 200 /* 0.20 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API #define MESHOPTIMIZER_API #endif +/* Set the calling-convention for alloc/dealloc function pointers */ +#ifndef MESHOPTIMIZER_ALLOC_CALLCONV +#ifdef _MSC_VER +#define MESHOPTIMIZER_ALLOC_CALLCONV __cdecl +#else +#define MESHOPTIMIZER_ALLOC_CALLCONV +#endif +#endif + /* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */ #define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API @@ -28,8 +37,8 @@ extern "C" { #endif /** - * Vertex attribute stream, similar to glVertexPointer - * Each element takes size bytes, with stride controlling the spacing between successive elements. + * Vertex attribute stream + * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size). 
*/ struct meshopt_Stream { @@ -42,6 +51,7 @@ struct meshopt_Stream * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence. * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. + * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized. * * destination must contain enough space for the resulting remap table (vertex_count elements) * indices can be NULL if the input is unindexed @@ -53,9 +63,11 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence. * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream. + * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized. * * destination must contain enough space for the resulting remap table (vertex_count elements) * indices can be NULL if the input is unindexed + * stream_count must be <= 16 */ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); @@ -79,6 +91,7 @@ MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer. * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering. + * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized. * * destination must contain enough space for the resulting index buffer (index_count elements) */ @@ -88,11 +101,42 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer. * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering. + * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized. 
* * destination must contain enough space for the resulting index buffer (index_count elements) + * stream_count must be <= 16 */ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); +/** + * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology + * Each triangle is converted into a 6-vertex patch with the following layout: + * - 0, 2, 4: original triangle vertices + * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40 + * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY. + * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering. + * + * destination must contain enough space for the resulting index buffer (index_count*2 elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + */ +MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement + * Each triangle is converted into a 12-vertex patch with the following layout: + * - 0, 1, 2: original triangle vertices + * - 3, 4: opposing edge for edge 0, 1 + * - 5, 6: opposing edge for edge 1, 2 + * - 7, 8: opposing edge for edge 2, 0 + * - 9, 10, 11: dominant vertices for corners 0, 1, 2 + * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping. + * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details. + * + * destination must contain enough space for the resulting index buffer (index_count*4 elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + */ +MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + /** * Vertex transform cache optimizer * Reorders indices to reduce the number of GPU vertex shader invocations @@ -129,7 +173,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination * * destination must contain enough space for the resulting index buffer (index_count elements) * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) 
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently */ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold); @@ -168,10 +212,10 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count); /** - * Experimental: Set index encoder format version + * Set index encoder format version * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+) */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version); +MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version); /** * Index buffer decoder @@ -184,15 +228,15 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version); MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); /** - * Experimental: Index sequence encoder + * Index sequence encoder * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original. * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better. * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space * * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size) */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count); -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count); +MESHOPTIMIZER_API size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count); +MESHOPTIMIZER_API size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count); /** * Index sequence decoder @@ -202,13 +246,14 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_ * * destination must contain enough space for the resulting index sequence (index_count elements) */ -MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); +MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); /** * Vertex buffer encoder * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original. * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream. 
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized. * * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) */ @@ -216,10 +261,10 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size); /** - * Experimental: Set vertex encoder format version + * Set vertex encoder format version * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version); +MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); /** * Vertex buffer decoder @@ -234,7 +279,6 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte /** * Vertex buffer filters * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place. - * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation. * * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f. * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. @@ -245,12 +289,51 @@ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t verte * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M. * Each 32-bit component is decoded in isolation; stride must be divisible by 4. */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); /** - * Experimental: Mesh simplifier + * Vertex buffer filter encoders + * These functions can be used to encode data in a format that meshopt_decodeFilter can decode + * + * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. + * Input data must contain 4 floats for every vector (count*4 total). + * + * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding. + * Each component is stored as a 16-bit integer; stride must be equal to 8. + * Input data must contain 4 floats for every quaternion (count*4 total). + * + * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24). + * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4. + * Input data must contain stride/4 floats for every vector (count*stride/4 total). 
+ */ +enum meshopt_EncodeExpMode +{ + /* When encoding exponents, use separate values for each component (maximum quality) */ + meshopt_EncodeExpSeparate, + /* When encoding exponents, use shared value for all components of each vector (better compression) */ + meshopt_EncodeExpSharedVector, + /* When encoding exponents, use shared value for each component of all vectors (best compression) */ + meshopt_EncodeExpSharedComponent, +}; + +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); + +/** + * Simplification options + */ +enum +{ + /* Do not move vertices that are located on the topological border (vertices on triangle edges that don't have a paired triangle). Useful for simplifying portions of the larger mesh. */ + meshopt_SimplifyLockBorder = 1 << 0, +}; + +/** + * Mesh simplifier * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification. @@ -258,23 +341,40 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t ver * The resulting index buffer references vertices from the original vertex buffer. * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * - * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error); +MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error); + +/** + * Experimental: Mesh simplifier with attribute metric + * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details. 
+ * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes. + * + * vertex_attributes should have attribute_count floats for each vertex + * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range. + * attribute_count must be <= 16 + * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** * Experimental: Mesh simplifier (sloppy) - * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance - * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count. + * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance + * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * - * destination must contain enough space for the target index buffer - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); /** * Experimental: Point cloud simplifier @@ -283,10 +383,19 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati * The resulting index buffer references vertices from the original vertex buffer. * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. 
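 *
 * A minimal call sketch under assumed names (parameter notes follow below); colors are omitted by passing NULL:
 *   size_t kept = meshopt_simplifyPoints(dst, positions, vertex_count, 12, NULL, 0, 0.f, vertex_count / 10);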
 *
- * destination must contain enough space for the target index buffer
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * destination must contain enough space for the target index buffer (target_vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+
+/**
+ * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
+ *
+ * Absolute error must be *divided* by the scaling factor before passing it to meshopt_simplify as target_error
+ * Relative error returned by meshopt_simplify via result_error must be *multiplied* by the scaling factor to get absolute error.
+ */
+MESHOPTIMIZER_API float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

 /**
 * Mesh stripifier
@@ -338,7 +447,7 @@ struct meshopt_OverdrawStatistics
 * Returns overdraw statistics using a software rasterizer
 * Results may not match actual GPU performance
 *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
 */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -357,23 +466,32 @@ MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetc
 struct meshopt_Meshlet
 {
- unsigned int vertices[64];
- unsigned char indices[126][3];
- unsigned char triangle_count;
- unsigned char vertex_count;
+ /* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
+ unsigned int vertex_offset;
+ unsigned int triangle_offset;
+
+ /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+ unsigned int vertex_count;
+ unsigned int triangle_count;
 };

 /**
- * Experimental: Meshlet builder
+ * Meshlet builder
 * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
 * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
- * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
+ * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
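+ *
+ * A minimal build sketch under assumed names (see the parameter notes below):
+ *   size_t max_meshlets = meshopt_buildMeshletsBound(index_count, 64, 124);
+ *   std::vector<meshopt_Meshlet> meshlets(max_meshlets);
+ *   std::vector<unsigned int> meshlet_vertices(max_meshlets * 64);
+ *   std::vector<unsigned char> meshlet_triangles(max_meshlets * 124 * 3);
+ *   size_t count = meshopt_buildMeshlets(&meshlets[0], &meshlet_vertices[0], &meshlet_triangles[0],
+ *       indices, index_count, positions, vertex_count, 12, 64, 124, 0.25f);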
* - * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound - * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126) + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound + * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices + * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512) + * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); +MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); struct meshopt_Bounds { @@ -392,13 +510,13 @@ struct meshopt_Bounds }; /** - * Experimental: Cluster bounds generator + * Cluster bounds generator * Creates bounding volumes that can be used for frustum, backface and occlusion culling. * * For backface culling with orthographic projection, use the following formula to reject backfacing clusters: * dot(view, cone_axis) >= cone_cutoff * - * For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff: + * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff: * dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff * * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead: @@ -407,29 +525,31 @@ struct meshopt_Bounds * dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius * * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere - * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable. + * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable (for derivation see + * Real-Time Rendering 4th Edition, section 19.3). 
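+ *
+ * A sketch of the perspective-projection test above, assuming a float3 type with dot/normalize/operator- helpers
+ * and a bounds value produced by meshopt_computeMeshletBounds (declared below):
+ *   float3 apex = {bounds.cone_apex[0], bounds.cone_apex[1], bounds.cone_apex[2]};
+ *   float3 axis = {bounds.cone_axis[0], bounds.cone_axis[1], bounds.cone_axis[2]};
+ *   bool culled = dot(normalize(apex - camera_position), axis) >= bounds.cone_cutoff;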
* - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size) */ -MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); -MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** - * Experimental: Spatial sorter + * Spatial sorter * Generates a remap table that can be used to reorder points for spatial locality. * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer. * * destination must contain enough space for the resulting remap table (vertex_count elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Experimental: Spatial sorter * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. * * destination must contain enough space for the resulting index buffer (index_count elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -439,7 +559,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti * Note that all algorithms only allocate memory for temporary use. * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first. 
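 *
 * A minimal hookup sketch (the hook functions are illustrative, not part of the library):
 *   static void* MESHOPTIMIZER_ALLOC_CALLCONV trackedAllocate(size_t size) { return operator new(size); }
 *   static void MESHOPTIMIZER_ALLOC_CALLCONV trackedDeallocate(void* ptr) { operator delete(ptr); }
 *   meshopt_setAllocator(trackedAllocate, trackedDeallocate);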
 */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));

 #ifdef __cplusplus
 } /* extern "C" */
@@ -462,19 +582,25 @@ inline int meshopt_quantizeUnorm(float v, int N);
 inline int meshopt_quantizeSnorm(float v, int N);

 /**
- * Quantize a float into half-precision floating point value
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
 * Representable magnitude range: [6e-5; 65504]
 * Maximum relative reconstruction error: 5e-4
 */
-inline unsigned short meshopt_quantizeHalf(float v);
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);

 /**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
 * Assumes N is in a valid mantissa precision range, which is 1..23
 */
-inline float meshopt_quantizeFloat(float v, int N);
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif

 /**
@@ -497,6 +623,10 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -517,9 +647,11 @@ inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_s
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
 template <typename T>
-inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
-inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
@@ -531,7 +663,9 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
@@ -561,50 +695,6 @@ inline int meshopt_quantizeSnorm(float v, int N)

 return int(v * scale + round);
 }
-
-inline unsigned short meshopt_quantizeHalf(float v)
-{
- union { float f; unsigned int ui; } u = {v};
- unsigned int ui = u.ui;
-
- int s = (ui >> 16) & 0x8000;
- int em = ui & 0x7fffffff;
-
- /* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
- int h = (em - (112 << 23) + (1 << 12)) >> 13;
-
- /* underflow: flush to zero; 113 encodes exponent -14 */
- h = (em < (113 << 23)) ? 0 : h;
-
- /* overflow: infinity; 143 encodes exponent 16 */
- h = (em >= (143 << 23)) ? 0x7c00 : h;
-
- /* NaN; note that we convert all types of NaN to qNaN */
- h = (em > (255 << 23)) ? 0x7e00 : h;
-
- return (unsigned short)(s | h);
-}
-
-inline float meshopt_quantizeFloat(float v, int N)
-{
- union { float f; unsigned int ui; } u = {v};
- unsigned int ui = u.ui;
-
- const int mask = (1 << (23 - N)) - 1;
- const int round = (1 << (23 - N)) >> 1;
-
- int e = ui & 0x7f800000;
- unsigned int rui = (ui + round) & ~mask;
-
- /* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
- ui = e == 0x7f800000 ? ui : rui;
-
- /* flush denormals to zero */
- ui = e == 0 ? 0 : ui;
-
- u.ui = ui;
- return u.f;
-}
 #endif

 /* Internal implementation helpers */
@@ -615,8 +705,8 @@ public:
 template <typename T>
 struct StorageT
 {
- static void* (*allocate)(size_t);
- static void (*deallocate)(void*);
+ static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
+ static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
 };

 typedef StorageT<void> Storage;
@@ -641,14 +731,21 @@ public:
 return result;
 }

+ void deallocate(void* ptr)
+ {
+ assert(count > 0 && blocks[count - 1] == ptr);
+ Storage::deallocate(ptr);
+ count--;
+ }
+
 private:
- void* blocks[16];
+ void* blocks[24];
 size_t count;
 };

 // This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif

 /* Inline implementation for C++ templated wrappers */
@@ -665,7 +762,7 @@ struct meshopt_IndexAdapter
 meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
 : result(result_)
- , data(0)
+ , data(NULL)
 , count(count_)
 {
 size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
@@ -705,33 +802,33 @@ struct meshopt_IndexAdapter
 template <typename T>
 inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
 {
- meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+ meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);

- return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+ return meshopt_generateVertexRemap(destination, indices ? in.data : NULL, index_count, vertices, vertex_count, vertex_size);
 }

 template <typename T>
 inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+ meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0);

- return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+ return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
 }

 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
 {
- meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
 meshopt_IndexAdapter<T> out(destination, 0, index_count);

- meshopt_remapIndexBuffer(out.data, indices ? in.data : NULL, index_count, remap);
 }

 template <typename T>
 inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
 {
- meshopt_IndexAdapter<T> in(NULL, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
 }
@@ -739,17 +836,35 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
 }

+template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count * 2);
+
+ meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count * 4);
+
+ meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
 template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
 }
@@ -757,8 +872,8 @@ inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
 }
@@ -766,8 +881,8 @@ inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, s
 template <typename T>
 inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
 }
@@ -775,8 +890,8 @@ inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, si
 template <typename T>
 inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
 }
@@ -784,7 +899,7 @@ inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t in
 template <typename T>
 inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
 }
@@ -800,7 +915,7 @@ inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t
 template <typename T>
 inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
 }
@@ -817,7 +932,7 @@ inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const u
 template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
 }
@@ -832,28 +947,37 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 }

 template <typename T>
-inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

- return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
+ return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, options, result_error);
 }

 template <typename T>
-inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

- return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+ return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
 }

 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, (index_count / 3) * 5);

 return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
 }
@@ -861,8 +985,8 @@ inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_co
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, (index_count - 2) * 3);

 return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
 }
@@ -870,7 +994,7 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
 template <typename T>
 inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
 }
@@ -878,7 +1002,7 @@ inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices
 template <typename T>
 inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -886,23 +1010,31 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
 }

 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

- return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+ return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+ return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }

 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);

 return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
@@ -910,15 +1042,15 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
- meshopt_IndexAdapter<T> in(0, indices, index_count);
- meshopt_IndexAdapter<T> out(destination, 0, index_count);
+ meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, NULL, index_count);

 meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 #endif

 /**
- * Copyright (c) 2016-2020 Arseny Kapoulkine
+ * Copyright (c) 2016-2023 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
diff --git a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp b/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
index 8d5859ba3..8b6f25413 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 using namespace meshopt;

 assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 assert(vertex_positions_stride % sizeof(float) == 0);

 meshopt_Allocator allocator;
diff --git a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
index 143656ed7..cc22dbcff 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 using namespace meshopt;

 assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 assert(vertex_positions_stride % sizeof(float) == 0);

 meshopt_Allocator allocator;
diff --git a/Source/ThirdParty/meshoptimizer/quantization.cpp b/Source/ThirdParty/meshoptimizer/quantization.cpp
new file mode 100644
index 000000000..09a314d60
--- /dev/null
+++ b/Source/ThirdParty/meshoptimizer/quantization.cpp
@@ -0,0 +1,70 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+
+unsigned short meshopt_quantizeHalf(float v)
+{
+ union { float f; unsigned int ui; } u = {v};
+ unsigned int ui = u.ui;
+
+ int s = (ui >> 16) & 0x8000;
+ int em = ui & 0x7fffffff;
+
+ // bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+ int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+ // underflow: flush to zero; 113 encodes exponent -14
+ h = (em < (113 << 23)) ? 0 : h;
+
+ // overflow: infinity; 143 encodes exponent 16
+ h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+ // NaN; note that we convert all types of NaN to qNaN
+ h = (em > (255 << 23)) ? 0x7e00 : h;
+
+ return (unsigned short)(s | h);
+}
+
+float meshopt_quantizeFloat(float v, int N)
+{
+ assert(N >= 0 && N <= 23);
+
+ union { float f; unsigned int ui; } u = {v};
+ unsigned int ui = u.ui;
+
+ const int mask = (1 << (23 - N)) - 1;
+ const int round = (1 << (23 - N)) >> 1;
+
+ int e = ui & 0x7f800000;
+ unsigned int rui = (ui + round) & ~mask;
+
+ // round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
+ ui = e == 0x7f800000 ? ui : rui;
+
+ // flush denormals to zero
+ ui = e == 0 ? 0 : ui;
+
+ u.ui = ui;
+ return u.f;
+}
+
+float meshopt_dequantizeHalf(unsigned short h)
+{
+ unsigned int s = unsigned(h & 0x8000) << 16;
+ int em = h & 0x7fff;
+
+ // bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+ int r = (em + (112 << 10)) << 13;
+
+ // denormal: flush to zero
+ r = (em < (1 << 10)) ? 0 : r;
+
+ // infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+ // 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+ r += (em >= (31 << 10)) ? (112 << 23) : 0;
+
+ union { float f; unsigned int ui; } u;
+ u.ui = s | r;
+ return u.f;
+}
diff --git a/Source/ThirdParty/meshoptimizer/simplifier.cpp b/Source/ThirdParty/meshoptimizer/simplifier.cpp
index dd0ff9b07..5ba857007 100644
--- a/Source/ThirdParty/meshoptimizer/simplifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp
@@ -14,39 +14,55 @@
 #include <stdio.h>
 #endif

+#if TRACE
+#define TRACESTATS(i) stats[i]++;
+#else
+#define TRACESTATS(i) (void)0
+#endif
+
 // This work is based on:
 // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
 // Michael Garland. Quadric-based polygonal surface simplification. 1999
 // Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
 // Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
 // Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999

 namespace meshopt
 {

 struct EdgeAdjacency
 {
- unsigned int* counts;
+ struct Edge
+ {
+ unsigned int next;
+ unsigned int prev;
+ };
+
 unsigned int* offsets;
- unsigned int* data;
+ Edge* data;
 };

-static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+static void prepareEdgeAdjacency(EdgeAdjacency& adjacency, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+ adjacency.offsets = allocator.allocate<unsigned int>(vertex_count + 1);
+ adjacency.data = allocator.allocate<EdgeAdjacency::Edge>(index_count);
+}
+
+static void updateEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* remap)
 {
 size_t face_count = index_count / 3;
-
- // allocate arrays
- adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
- adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
- adjacency.data = allocator.allocate<unsigned int>(index_count);
+ unsigned int* offsets = adjacency.offsets + 1;
+ EdgeAdjacency::Edge* data = adjacency.data;

 // fill edge counts
- memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+ memset(offsets, 0, vertex_count * sizeof(unsigned int));

 for (size_t i = 0; i < index_count; ++i)
 {
- assert(indices[i] < vertex_count);
+ unsigned int v = remap ? remap[indices[i]] : indices[i];
+ assert(v < vertex_count);

- adjacency.counts[indices[i]]++;
+ offsets[v]++;
 }

 // fill offset table
@@ -54,8 +70,9 @@ static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* ind

 for (size_t i = 0; i < vertex_count; ++i)
 {
- adjacency.offsets[i] = offset;
- offset += adjacency.counts[i];
+ unsigned int count = offsets[i];
+ offsets[i] = offset;
+ offset += count;
 }

 assert(offset == index_count);
@@ -65,18 +82,29 @@ static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* ind
 {
 unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];

- adjacency.data[adjacency.offsets[a]++] = b;
- adjacency.data[adjacency.offsets[b]++] = c;
- adjacency.data[adjacency.offsets[c]++] = a;
+ if (remap)
+ {
+ a = remap[a];
+ b = remap[b];
+ c = remap[c];
+ }
+
+ data[offsets[a]].next = b;
+ data[offsets[a]].prev = c;
+ offsets[a]++;
+
+ data[offsets[b]].next = c;
+ data[offsets[b]].prev = a;
+ offsets[b]++;
+
+ data[offsets[c]].next = a;
+ data[offsets[c]].prev = b;
+ offsets[c]++;
 }

- // fix offsets that have been disturbed by the previous pass
- for (size_t i = 0; i < vertex_count; ++i)
- {
- assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
- adjacency.offsets[i] -= adjacency.counts[i];
- }
+ // finalize offsets
+ adjacency.offsets[0] = 0;
+ assert(adjacency.offsets[vertex_count] == index_count);
 }

 struct PositionHasher
@@ -86,26 +114,15 @@ struct PositionHasher

 size_t hash(unsigned int index) const
 {
- // MurmurHash2
- const unsigned int m = 0x5bd1e995;
- const int r = 24;
-
- unsigned int h = 0;
 const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);

- for (size_t i = 0; i < 3; ++i)
- {
- unsigned int k = key[i];
+ // scramble bits to make sure that integer coordinates have entropy in lower bits
+ unsigned int x = key[0] ^ (key[0] >> 17);
+ unsigned int y = key[1] ^ (key[1] >> 17);
+ unsigned int z = key[2] ^ (key[2] >> 17);

- k *= m;
- k ^= k >> r;
- k *= m;
-
- h *= m;
- h ^= k;
- }
-
- return h;
+ // Optimized Spatial Hashing for Collision Detection of Deformable Objects
+ return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
 }

 bool equal(unsigned int lhs, unsigned int rhs) const
@@ -117,7 +134,7 @@ struct PositionHasher
 static size_t hashBuckets2(size_t count)
 {
 size_t buckets = 1;
- while (buckets < count)
+ while (buckets < count + count / 4)
 buckets *= 2;

 return buckets;
@@ -147,7 +164,7 @@ static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key,
 }

 assert(false && "Hash table is full"); // unreachable
- return 0;
+ return NULL;
 }

 static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
@@ -184,6 +201,8 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
 wedge[i] = wedge[r];
 wedge[r] = unsigned(i);
 }
+
+ allocator.deallocate(table);
 }

 enum VertexKind
@@ -223,60 +242,56 @@ const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {

 static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
 {
- unsigned int count = adjacency.counts[a];
- const unsigned int* data = adjacency.data + adjacency.offsets[a];
+ unsigned int count = adjacency.offsets[a + 1] - adjacency.offsets[a];
+ const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[a];

 for (size_t i = 0; i < count; ++i)
- if (data[i] == b)
+ if (edges[i].next == b)
 return true;

 return
false; } -static unsigned int findWedgeEdge(const EdgeAdjacency& adjacency, const unsigned int* wedge, unsigned int a, unsigned int b) +static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, unsigned int options) { - unsigned int v = a; + memset(loop, -1, vertex_count * sizeof(unsigned int)); + memset(loopback, -1, vertex_count * sizeof(unsigned int)); - do - { - if (hasEdge(adjacency, v, b)) - return v; + // incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1 + // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam + // but here it's okay to fill the data out for other types of vertices as well + unsigned int* openinc = loopback; + unsigned int* openout = loop; - v = wedge[v]; - } while (v != a); - - return ~0u; -} - -static size_t countOpenEdges(const EdgeAdjacency& adjacency, unsigned int vertex, unsigned int* last = 0) -{ - size_t result = 0; - - unsigned int count = adjacency.counts[vertex]; - const unsigned int* data = adjacency.data + adjacency.offsets[vertex]; - - for (size_t i = 0; i < count; ++i) - if (!hasEdge(adjacency, data[i], vertex)) - { - result++; - - if (last) - *last = data[i]; - } - - return result; -} - -static void classifyVertices(unsigned char* result, unsigned int* loop, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge) -{ for (size_t i = 0; i < vertex_count; ++i) - loop[i] = ~0u; + { + unsigned int vertex = unsigned(i); + + unsigned int count = adjacency.offsets[vertex + 1] - adjacency.offsets[vertex]; + const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[vertex]; + + for (size_t j = 0; j < count; ++j) + { + unsigned int target = edges[j].next; + + if (target == vertex) + { + // degenerate triangles have two distinct edges instead of three, and the self edge + // is bi-directional by definition; this can break border/seam classification by "closing" + // the open edge from another triangle and falsely marking the vertex as manifold + // instead we mark the vertex as having >1 open edges which turns it into locked/complex + openinc[vertex] = openout[vertex] = vertex; + } + else if (!hasEdge(adjacency, target, vertex)) + { + openinc[target] = (openinc[target] == ~0u) ? vertex : target; + openout[vertex] = (openout[vertex] == ~0u) ? 
target : vertex; + } + } + } #if TRACE - size_t lockedstats[4] = {}; -#define TRACELOCKED(i) lockedstats[i]++; -#else -#define TRACELOCKED(i) (void)0 + size_t stats[4] = {}; #endif for (size_t i = 0; i < vertex_count; ++i) @@ -286,67 +301,57 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, size_t v if (wedge[i] == i) { // no attribute seam, need to check if it's manifold - unsigned int v = 0; - size_t edges = countOpenEdges(adjacency, unsigned(i), &v); + unsigned int openi = openinc[i], openo = openout[i]; // note: we classify any vertices with no open edges as manifold // this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold // it's unclear if this is a problem in practice - // also note that we classify vertices as border if they have *one* open edge, not two - // this is because we only have half-edges - so a border vertex would have one incoming and one outgoing edge - if (edges == 0) + if (openi == ~0u && openo == ~0u) { result[i] = Kind_Manifold; } - else if (edges == 1) + else if (openi != i && openo != i) { result[i] = Kind_Border; - loop[i] = v; } else { result[i] = Kind_Locked; - TRACELOCKED(0); + TRACESTATS(0); } } else if (wedge[wedge[i]] == i) { // attribute seam; need to distinguish between Seam and Locked - unsigned int a = 0; - size_t a_count = countOpenEdges(adjacency, unsigned(i), &a); - unsigned int b = 0; - size_t b_count = countOpenEdges(adjacency, wedge[i], &b); + unsigned int w = wedge[i]; + unsigned int openiv = openinc[i], openov = openout[i]; + unsigned int openiw = openinc[w], openow = openout[w]; // seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap - if (a_count == 1 && b_count == 1) + if (openiv != ~0u && openiv != i && openov != ~0u && openov != i && + openiw != ~0u && openiw != w && openow != ~0u && openow != w) { - unsigned int ao = findWedgeEdge(adjacency, wedge, a, wedge[i]); - unsigned int bo = findWedgeEdge(adjacency, wedge, b, unsigned(i)); - - if (ao != ~0u && bo != ~0u) + if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw]) { result[i] = Kind_Seam; - - loop[i] = a; - loop[wedge[i]] = b; } else { result[i] = Kind_Locked; - TRACELOCKED(1); + TRACESTATS(1); } } else { result[i] = Kind_Locked; - TRACELOCKED(2); + TRACESTATS(2); } } else { // more than one vertex maps to this one; we don't have classification available result[i] = Kind_Locked; - TRACELOCKED(3); + TRACESTATS(3); } } else @@ -357,9 +362,14 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, size_t v } } + if (options & meshopt_SimplifyLockBorder) + for (size_t i = 0; i < vertex_count; ++i) + if (result[i] == Kind_Border) + result[i] = Kind_Locked; + #if TRACE printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n", - int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3])); + int(stats[0]), int(stats[1]), int(stats[2]), int(stats[3])); #endif } @@ -368,7 +378,7 @@ struct Vector3 float x, y, z; }; -static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -379,9 +389,12 @@ static void rescalePositions(Vector3* result, const float* vertex_positions_data { const 
float* v = vertex_positions_data + i * vertex_stride_float; - result[i].x = v[0]; - result[i].y = v[1]; - result[i].z = v[2]; + if (result) + { + result[i].x = v[0]; + result[i].y = v[1]; + result[i].z = v[2]; + } for (int j = 0; j < 3; ++j) { @@ -398,30 +411,67 @@ static void rescalePositions(Vector3* result, const float* vertex_positions_data extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); - float scale = extent == 0 ? 0.f : 1.f / extent; + if (result) + { + float scale = extent == 0 ? 0.f : 1.f / extent; + + for (size_t i = 0; i < vertex_count; ++i) + { + result[i].x = (result[i].x - minv[0]) * scale; + result[i].y = (result[i].y - minv[1]) * scale; + result[i].z = (result[i].z - minv[2]) * scale; + } + } + + return extent; +} + +static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count) +{ + size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float); for (size_t i = 0; i < vertex_count; ++i) { - result[i].x = (result[i].x - minv[0]) * scale; - result[i].y = (result[i].y - minv[1]) * scale; - result[i].z = (result[i].z - minv[2]) * scale; + for (size_t k = 0; k < attribute_count; ++k) + { + float a = vertex_attributes_data[i * vertex_attributes_stride_float + k]; + + result[i * attribute_count + k] = a * attribute_weights[k]; + } } } +static const size_t kMaxAttributes = 16; + struct Quadric { + // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c float a00, a11, a22; float a10, a20, a21; float b0, b1, b2, c; float w; }; +struct QuadricGrad +{ + // gx*x + gy*y + gz*z + gw + float gx, gy, gz, gw; +}; + +struct Reservoir +{ + float x, y, z; + float r, g, b; + float w; +}; + struct Collapse { unsigned int v0; unsigned int v1; - union { + union + { unsigned int bidi; float error; unsigned int errorui; @@ -457,6 +507,17 @@ static void quadricAdd(Quadric& Q, const Quadric& R) Q.w += R.w; } +static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count) +{ + for (size_t k = 0; k < attribute_count; ++k) + { + G[k].gx += R[k].gx; + G[k].gy += R[k].gy; + G[k].gz += R[k].gz; + G[k].gw += R[k].gw; + } +} + static float quadricError(const Quadric& Q, const Vector3& v) { float rx = Q.b0; @@ -485,6 +546,45 @@ static float quadricError(const Quadric& Q, const Vector3& v) return fabsf(r) * s; } +static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va) +{ + float rx = Q.b0; + float ry = Q.b1; + float rz = Q.b2; + + rx += Q.a10 * v.y; + ry += Q.a21 * v.z; + rz += Q.a20 * v.x; + + rx *= 2; + ry *= 2; + rz *= 2; + + rx += Q.a00 * v.x; + ry += Q.a11 * v.y; + rz += Q.a22 * v.z; + + float r = Q.c; + r += rx * v.x; + r += ry * v.y; + r += rz * v.z; + + // see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr + for (size_t k = 0; k < attribute_count; ++k) + { + float a = va[k]; + float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw; + + r += a * a * Q.w; + r -= 2 * a * g; + } + + // TODO: weight normalization is breaking attribute error somehow + float s = 1;// Q.w == 0.f ? 
0.f : 1.f / Q.w; + + return fabsf(r) * s; +} + static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) { float aw = a * w; @@ -505,22 +605,6 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo Q.w = w; } -static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) -{ - // we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric - Q.a00 = w; - Q.a11 = w; - Q.a22 = w; - Q.a10 = 0.f; - Q.a20 = 0.f; - Q.a21 = 0.f; - Q.b0 = -2.f * x * w; - Q.b1 = -2.f * y * w; - Q.b2 = -2.f * z * w; - Q.c = (x * x + y * y + z * z) * w; - Q.w = w; -} - static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) { Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; @@ -555,6 +639,82 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3 quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight); } +static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count) +{ + // for each attribute we want to encode the following function into the quadric: + // (eval(pos) - attr)^2 + // where eval(pos) interpolates attribute across the triangle like so: + // eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw + // where gx/gy/gz/gw are gradients + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + + // weight is scaled linearly with edge length + Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x}; + float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z); + float w = sqrtf(area); // TODO this needs more experimentation + + // we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows: + // v = (d11 * d20 - d01 * d21) / denom + // w = (d00 * d21 - d01 * d20) / denom + // u = 1 - v - w + // here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj) + const Vector3& v0 = p10; + const Vector3& v1 = p20; + float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z; + float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; + float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z; + float denom = d00 * d11 - d01 * d01; + float denomr = denom == 0 ? 
0.f : 1.f / denom;
+
+	// precompute gradient factors
+	// these are derived by directly computing the derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+	float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
+	float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
+	float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
+	float gy2 = (d00 * v1.y - d01 * v0.y) * denomr;
+	float gz1 = (d11 * v0.z - d01 * v1.z) * denomr;
+	float gz2 = (d00 * v1.z - d01 * v0.z) * denomr;
+
+	memset(&Q, 0, sizeof(Quadric));
+
+	Q.w = w;
+
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		float a0 = va0[k], a1 = va1[k], a2 = va2[k];
+
+		// compute gradient of eval(pos) for x/y/z/w
+		// the formulas below are obtained by directly computing the derivative of eval(pos) = a0 * u + a1 * v + a2 * w
+		float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0);
+		float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0);
+		float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0);
+		float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz;
+
+		// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
+		// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+		Q.a00 += w * (gx * gx);
+		Q.a11 += w * (gy * gy);
+		Q.a22 += w * (gz * gz);
+
+		Q.a10 += w * (gy * gx);
+		Q.a20 += w * (gz * gx);
+		Q.a21 += w * (gz * gy);
+
+		Q.b0 += w * (gx * gw);
+		Q.b1 += w * (gy * gw);
+		Q.b2 += w * (gz * gw);
+
+		Q.c += w * (gw * gw);
+
+		// the only remaining sum components are ones that depend on attr; these will be added during error evaluation, see quadricError
+		G[k].gx = w * gx;
+		G[k].gy = w * gy;
+		G[k].gz = w * gz;
+		G[k].gw = w * gw;
+	}
+}
+
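The gradient setup above is easiest to verify on a concrete triangle. The following self-contained sketch (attributeGradient is a hypothetical helper mirroring the factored formulas; it is not part of the library) computes the interpolation gradient for one attribute and checks that the resulting plane equation reproduces the corner values:

    #include <cassert>
    #include <cstdio>

    // returns g such that g[0]*x + g[1]*y + g[2]*z + g[3] interpolates a0/a1/a2 over the triangle
    static void attributeGradient(const float p0[3], const float p1[3], const float p2[3], float a0, float a1, float a2, float g[4])
    {
    	float v0[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
    	float v1[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

    	float d00 = v0[0] * v0[0] + v0[1] * v0[1] + v0[2] * v0[2];
    	float d01 = v0[0] * v1[0] + v0[1] * v1[1] + v0[2] * v1[2];
    	float d11 = v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2];
    	float denom = d00 * d11 - d01 * d01;
    	float denomr = denom == 0 ? 0.f : 1.f / denom;

    	for (int i = 0; i < 3; ++i)
    	{
    		float gi1 = (d11 * v0[i] - d01 * v1[i]) * denomr; // gradient of barycentric v
    		float gi2 = (d00 * v1[i] - d01 * v0[i]) * denomr; // gradient of barycentric w
    		g[i] = gi1 * (a1 - a0) + gi2 * (a2 - a0);
    	}

    	g[3] = a0 - p0[0] * g[0] - p0[1] * g[1] - p0[2] * g[2];
    }

    int main()
    {
    	float p0[3] = {0, 0, 0}, p1[3] = {1, 0, 0}, p2[3] = {0, 1, 0};
    	float g[4];
    	attributeGradient(p0, p1, p2, /* a0= */ 0.f, /* a1= */ 1.f, /* a2= */ 2.f, g);

    	// for this right triangle eval(pos) = x + 2*y, which matches the corner attributes exactly
    	assert(g[0] == 1.f && g[1] == 2.f && g[2] == 0.f && g[3] == 0.f);
    	printf("gradient: %f %f %f %f\n", g[0], g[1], g[2], g[3]);
    }
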
 static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
 {
 	for (size_t i = 0; i < index_count; i += 3)
@@ -572,11 +732,11 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 	}
 }
 
-static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
 {
 	for (size_t i = 0; i < index_count; i += 3)
 	{
-		static const int next[3] = {1, 2, 0};
+		static const int next[4] = {1, 2, 0, 1};
 
 		for (int e = 0; e < 3; ++e)
 		{
@@ -586,19 +746,30 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 			unsigned char k0 = vertex_kind[i0];
 			unsigned char k1 = vertex_kind[i1];
 
-			// check that i0 and i1 are border/seam and are on the same edge loop
-			// loop[] tracks half edges so we only need to check i0->i1
-			if (k0 != k1 || (k0 != Kind_Border && k0 != Kind_Seam) || loop[i0] != i1)
+			// check that either i0 or i1 is border/seam and is on the same edge loop
+			// note that we need to add the error even for edges that connect e.g. border & locked
+			// if we don't do that, the adjacent border->border edge won't have correct errors for corners
+			if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
 				continue;
 
-			unsigned int i2 = indices[i + next[next[e]]];
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+				continue;
+
+			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			unsigned int i2 = indices[i + next[e + 1]];
 
 			// we try hard to maintain border edge geometry; seam edges can move more freely
 			// due to topological restrictions on collapses, seam quadrics slightly improve collapse structure but aren't critical
 			const float kEdgeWeightSeam = 1.f;
 			const float kEdgeWeightBorder = 10.f;
 
-			float edgeWeight = (k0 == Kind_Seam) ? kEdgeWeightSeam : kEdgeWeightBorder;
+			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
 
 			Quadric Q;
 			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
@@ -609,7 +780,89 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 	}
 }
 
-static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		Quadric QA;
+		QuadricGrad G[kMaxAttributes];
+		quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
+
+		// TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
+		quadricAdd(attribute_quadrics[remap[i0]], QA);
+		quadricAdd(attribute_quadrics[remap[i1]], QA);
+		quadricAdd(attribute_quadrics[remap[i2]], QA);
+
+		quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count);
+	}
+}
+
+// does triangle ABC flip when C is replaced with D?
+static bool hasTriangleFlip(const Vector3& a, const Vector3& b, const Vector3& c, const Vector3& d)
+{
+	Vector3 eb = {b.x - a.x, b.y - a.y, b.z - a.z};
+	Vector3 ec = {c.x - a.x, c.y - a.y, c.z - a.z};
+	Vector3 ed = {d.x - a.x, d.y - a.y, d.z - a.z};
+
+	Vector3 nbc = {eb.y * ec.z - eb.z * ec.y, eb.z * ec.x - eb.x * ec.z, eb.x * ec.y - eb.y * ec.x};
+	Vector3 nbd = {eb.y * ed.z - eb.z * ed.y, eb.z * ed.x - eb.x * ed.z, eb.x * ed.y - eb.y * ed.x};
+
+	return nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z <= 0;
+}
+
+static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, const unsigned int* collapse_remap, unsigned int i0, unsigned int i1)
+{
+	assert(collapse_remap[i0] == i0);
+	assert(collapse_remap[i1] == i1);
+
+	const Vector3& v0 = vertex_positions[i0];
+	const Vector3& v1 = vertex_positions[i1];
+
+	const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+	size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int a = collapse_remap[edges[i].next];
+		unsigned int b = collapse_remap[edges[i].prev];
+
+		// skip triangles that will get collapsed by i0->i1 collapse or already got collapsed previously
+		if (a == i1 || b == i1 || a == b)
+			continue;
+
+		// early-out when at least one triangle flips due to a collapse
+		if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
+			return true;
+	}
+
+	return false;
+}
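
The flip test above reduces to checking whether triangles ABC and ABD have opposing normals around the shared edge AB. A standalone sketch of the same predicate (Vec3 is a hypothetical stand-in type; this is an illustration, not library code) showing both outcomes:

    #include <cassert>

    struct Vec3 { float x, y, z; };

    static Vec3 cross(const Vec3& u, const Vec3& v)
    {
    	return {u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x};
    }

    // same predicate as hasTriangleFlip: do ABC and ABD have opposing normals?
    static bool flips(const Vec3& a, const Vec3& b, const Vec3& c, const Vec3& d)
    {
    	Vec3 eb = {b.x - a.x, b.y - a.y, b.z - a.z};
    	Vec3 ec = {c.x - a.x, c.y - a.y, c.z - a.z};
    	Vec3 ed = {d.x - a.x, d.y - a.y, d.z - a.z};

    	Vec3 nbc = cross(eb, ec);
    	Vec3 nbd = cross(eb, ed);

    	return nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z <= 0;
    }

    int main()
    {
    	Vec3 a = {0, 0, 0}, b = {2, 0, 0}, c = {1, 1, 0};

    	assert(flips(a, b, c, {1, -1, 0})); // D crosses edge AB, so the triangle flips
    	assert(!flips(a, b, c, {1, 2, 0})); // D stays on C's side, so no flip
    }
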
+
+static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
+{
+	size_t dual_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned char k = vertex_kind[i];
+		unsigned int e = adjacency.offsets[i + 1] - adjacency.offsets[i];
+
+		dual_count += (k == Kind_Manifold || k == Kind_Seam) ? e : 0;
+	}
+
+	assert(dual_count <= index_count);
+
+	// pad capacity by 3 so that we can check for overflow once per triangle instead of once per edge
+	return (index_count - dual_count / 2) + 3;
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
 {
 	size_t collapse_count = 0;
 
@@ -617,6 +870,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices
 	{
 		static const int next[3] = {1, 2, 0};
 
+		// this should never happen as boundEdgeCollapses should give an upper bound for the collapse count, but in the unlikely event that it does we can just drop extra collapses
+		if (collapse_count + 3 > collapse_capacity)
+			break;
+
 		for (int e = 0; e < 3; ++e)
 		{
 			unsigned int i0 = indices[i + e];
@@ -667,7 +924,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices
 	return collapse_count;
 }
 
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
 {
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
@@ -681,11 +938,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 		unsigned int j0 = c.bidi ? i1 : i0;
 		unsigned int j1 = c.bidi ? 
i0 : i1; - const Quadric& qi = vertex_quadrics[remap[i0]]; - const Quadric& qj = vertex_quadrics[remap[j0]]; + float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]); + float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]); - float ei = quadricError(qi, vertex_positions[i1]); - float ej = quadricError(qj, vertex_positions[j1]); + if (attribute_count) + { + ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]); + ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]); + } // pick edge direction with minimal error c.v0 = ei <= ej ? i0 : j0; @@ -694,61 +954,6 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const } } -#if TRACE > 1 -static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind) -{ - size_t ckinds[Kind_Count][Kind_Count] = {}; - float cerrors[Kind_Count][Kind_Count] = {}; - - for (int k0 = 0; k0 < Kind_Count; ++k0) - for (int k1 = 0; k1 < Kind_Count; ++k1) - cerrors[k0][k1] = FLT_MAX; - - for (size_t i = 0; i < collapse_count; ++i) - { - unsigned int i0 = collapses[i].v0; - unsigned int i1 = collapses[i].v1; - - unsigned char k0 = vertex_kind[i0]; - unsigned char k1 = vertex_kind[i1]; - - ckinds[k0][k1]++; - cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1]; - } - - for (int k0 = 0; k0 < Kind_Count; ++k0) - for (int k1 = 0; k1 < Kind_Count; ++k1) - if (ckinds[k0][k1]) - printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]); -} - -static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind) -{ - size_t locked_collapses[Kind_Count][Kind_Count] = {}; - - for (size_t i = 0; i < index_count; i += 3) - { - static const int next[3] = {1, 2, 0}; - - for (int e = 0; e < 3; ++e) - { - unsigned int i0 = indices[i + e]; - unsigned int i1 = indices[i + next[e]]; - - unsigned char k0 = vertex_kind[i0]; - unsigned char k1 = vertex_kind[i1]; - - locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0]; - } - } - - for (int k0 = 0; k0 < Kind_Count; ++k0) - for (int k1 = 0; k1 < Kind_Count; ++k1) - if (locked_collapses[k0][k1]) - printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1])); -} -#endif - static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count) { const int sort_bits = 11; @@ -787,22 +992,38 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse } } -static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit) +static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const 
unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
 {
 	size_t edge_collapses = 0;
 	size_t triangle_collapses = 0;
 
+	// most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+	// note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+	size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+#if TRACE
+	size_t stats[4] = {};
+#endif
+
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
 		const Collapse& c = collapses[collapse_order[i]];
 
+		TRACESTATS(0);
+
 		if (c.error > error_limit)
 			break;
 
-		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10)
+		if (triangle_collapses >= triangle_collapse_goal)
 			break;
 
-		if (triangle_collapses >= triangle_collapse_goal)
+		// we limit the error in each pass based on the error of the optimal last collapse; since many collapses will be locked
+		// as they will share vertices with other successful collapses, we need to increase the acceptable error by some factor
+		float error_goal = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+
+		// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
+		// topology, we only abort once we have accumulated over 1/6 of the triangle collapse goal
+		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 6)
 			break;
 
 		unsigned int i0 = c.v0;
@@ -815,13 +1036,31 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
 		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
 		if (collapse_locked[r0] | collapse_locked[r1])
+		{
+			TRACESTATS(1);
 			continue;
+		}
+
+		if (hasTriangleFlips(adjacency, vertex_positions, collapse_remap, r0, r1))
+		{
+			// adjust collapse goal since this collapse is invalid and shouldn't factor into error goal
+			edge_collapse_goal++;
+
+			TRACESTATS(2);
+			continue;
+		}
 
 		assert(collapse_remap[r0] == r0);
 		assert(collapse_remap[r1] == r1);
 
 		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
 
+		if (attribute_count)
+		{
+			quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
+			quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
+		}
+
 		if (vertex_kind[i0] == Kind_Complex)
 		{
 			unsigned int v = i0;
@@ -857,8 +1096,18 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		// border edges collapse 1 triangle, other edges collapse 2 or more
 		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
 		edge_collapses++;
+
+		result_error = result_error < c.error ? c.error : result_error;
 	}
 
+#if TRACE
+	float error_goal_perfect = edge_collapse_goal < collapse_count ? 
collapses[collapse_order[edge_collapse_goal]].error : 0.f; + + printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d)\n", + int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_perfect), + int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2])); +#endif + return edge_collapses; } @@ -946,7 +1195,7 @@ struct IdHasher struct TriangleHasher { - unsigned int* indices; + const unsigned int* indices; size_t hash(unsigned int i) const { @@ -1074,17 +1323,41 @@ static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices } } -static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells) +static void fillCellReservoirs(Reservoir* cell_reservoirs, size_t cell_count, const Vector3* vertex_positions, const float* vertex_colors, size_t vertex_colors_stride, size_t vertex_count, const unsigned int* vertex_cells) { + static const float dummy_color[] = { 0.f, 0.f, 0.f }; + + size_t vertex_colors_stride_float = vertex_colors_stride / sizeof(float); + for (size_t i = 0; i < vertex_count; ++i) { - unsigned int c = vertex_cells[i]; + unsigned int cell = vertex_cells[i]; const Vector3& v = vertex_positions[i]; + Reservoir& r = cell_reservoirs[cell]; - Quadric Q; - quadricFromPoint(Q, v.x, v.y, v.z, 1.f); + const float* color = vertex_colors ? &vertex_colors[i * vertex_colors_stride_float] : dummy_color; - quadricAdd(cell_quadrics[c], Q); + r.x += v.x; + r.y += v.y; + r.z += v.z; + r.r += color[0]; + r.g += color[1]; + r.b += color[2]; + r.w += 1.f; + } + + for (size_t i = 0; i < cell_count; ++i) + { + Reservoir& r = cell_reservoirs[i]; + + float iw = r.w == 0.f ? 0.f : 1.f / r.w; + + r.x *= iw; + r.y *= iw; + r.z *= iw; + r.r *= iw; + r.g *= iw; + r.b *= iw; } } @@ -1105,6 +1378,34 @@ static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t c } } +static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Reservoir* cell_reservoirs, const Vector3* vertex_positions, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t vertex_count) +{ + static const float dummy_color[] = { 0.f, 0.f, 0.f }; + + size_t vertex_colors_stride_float = vertex_colors_stride / sizeof(float); + + memset(cell_remap, -1, cell_count * sizeof(unsigned int)); + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int cell = vertex_cells[i]; + const Vector3& v = vertex_positions[i]; + const Reservoir& r = cell_reservoirs[cell]; + + const float* color = vertex_colors ? 
&vertex_colors[i * vertex_colors_stride_float] : dummy_color;
+
+		float pos_error = (v.x - r.x) * (v.x - r.x) + (v.y - r.y) * (v.y - r.y) + (v.z - r.z) * (v.z - r.z);
+		float col_error = (color[0] - r.r) * (color[0] - r.r) + (color[1] - r.g) * (color[1] - r.g) + (color[2] - r.b) * (color[2] - r.b);
+		float error = pos_error + color_weight * col_error;
+
+		if (cell_remap[cell] == ~0u || cell_errors[cell] > error)
+		{
+			cell_remap[cell] = unsigned(i);
+			cell_errors[cell] = error;
+		}
+	}
+}
+
 static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
 {
 	TriangleHasher hasher = {destination};
@@ -1160,19 +1461,25 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
 }
 
 } // namespace meshopt
 
-#if TRACE
-unsigned char* meshopt_simplifyDebugKind = 0;
-unsigned int* meshopt_simplifyDebugLoop = 0;
+#ifndef NDEBUG
+// Note: this is only exposed for debug visualization purposes; do *not* use these outside debug builds
+MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = NULL;
+MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = NULL;
+MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = NULL;
 #endif
 
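With the new entry points defined below, attribute-aware simplification is driven through meshopt_simplifyWithAttributes. A minimal usage sketch (the Vertex layout, weights, and targets are hypothetical, not part of the patch); the weights scale each attribute's contribution to the collapse error relative to position error:

    #include "meshoptimizer.h"
    #include <vector>

    struct Vertex
    {
    	float px, py, pz; // position
    	float nx, ny, nz; // normal, weighted lower than position below
    };

    std::vector<unsigned int> simplifyHalf(const std::vector<unsigned int>& indices, const std::vector<Vertex>& vertices)
    {
    	const float attribute_weights[3] = {0.5f, 0.5f, 0.5f}; // weights for nx/ny/nz

    	std::vector<unsigned int> result(indices.size());
    	float result_error = 0.f;

    	// target ~50% of the original triangle count; options = 0 (or meshopt_SimplifyLockBorder)
    	size_t result_count = meshopt_simplifyWithAttributes(
    	    result.data(), indices.data(), indices.size(),
    	    &vertices[0].px, vertices.size(), sizeof(Vertex),
    	    &vertices[0].nx, sizeof(Vertex), attribute_weights, 3,
    	    indices.size() / 3 / 2 * 3, /* target_error= */ 1e-2f,
    	    /* options= */ 0, &result_error);

    	result.resize(result_count);
    	return result;
    }
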
-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
+	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
+	assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
+	assert(vertex_attributes_stride % sizeof(float) == 0);
+	assert(attribute_count <= kMaxAttributes);
 
 	meshopt_Allocator allocator;
 
@@ -1180,7 +1487,8 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	// build adjacency information
 	EdgeAdjacency adjacency = {};
-	buildEdgeAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+	prepareEdgeAdjacency(adjacency, index_count, vertex_count, allocator);
+	updateEdgeAdjacency(adjacency, indices, index_count, vertex_count, NULL);
 
 	// build position remap that maps each vertex to the one with identical position
 	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
@@ -1190,7 +1498,8 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
 	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
 	unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
-	classifyVertices(vertex_kind, loop, vertex_count, adjacency, remap, wedge);
+	unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+	classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge, options);
 
 #if TRACE
 	size_t unique_positions = 0;
@@ -1204,132 +1513,147 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 		kinds[vertex_kind[i]] += remap[i] == i;
 
 	printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
-	       int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+	    int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
 #endif
 
 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
 	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
 
+	float* vertex_attributes = NULL;
+
+	if (attribute_count)
+	{
+		vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
+		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count);
+	}
+
 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
 	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
 
+	Quadric* attribute_quadrics = NULL;
+	QuadricGrad* attribute_gradients = NULL;
+
+	if (attribute_count)
+	{
+		attribute_quadrics = allocator.allocate<Quadric>(vertex_count);
+		memset(attribute_quadrics, 0, vertex_count * sizeof(Quadric));
+
+		attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
+		memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+	}
+
 	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
-	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop);
+	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+
+	if (attribute_count)
+		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, indices, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
 
 	if (result != indices)
 		memcpy(result, indices, index_count * sizeof(unsigned int));
 
 #if TRACE
 	size_t pass_count = 0;
-	float worst_error = 0;
 #endif
 
-	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
-	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+	size_t collapse_capacity = boundEdgeCollapses(adjacency, vertex_count, index_count, vertex_kind);
+
+	Collapse* edge_collapses = allocator.allocate<Collapse>(collapse_capacity);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(collapse_capacity);
 	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
 	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
 
 	size_t result_count = index_count;
+	float result_error = 0;
 
 	// target_error input is linear; we need to adjust it to match quadricError units
 	float error_limit = target_error * target_error;
 
 	while (result_count > target_index_count)
 	{
-		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+		// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
+		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop);
+		assert(edge_collapse_count <= collapse_capacity);
 
 		// no edges can be collapsed any more due to topology restrictions
 		if (edge_collapse_count == 0)
 			break;
 
-		
rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap); - -#if TRACE > 1 - dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind); -#endif + rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap); sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count); - // most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit - // note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses size_t triangle_collapse_goal = (result_count - target_index_count) / 3; - size_t edge_collapse_goal = triangle_collapse_goal / 2; - - // we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked - // as they will share vertices with other successfull collapses, we need to increase the acceptable error by this factor - const float kPassErrorBound = 1.5f; - - float error_goal = edge_collapse_goal < edge_collapse_count ? edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX; for (size_t i = 0; i < vertex_count; ++i) collapse_remap[i] = unsigned(i); memset(collapse_locked, 0, vertex_count); - size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit); +#if TRACE + printf("pass %d: ", int(pass_count++)); +#endif + + size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error); // no edges can be collapsed any more due to hitting the error limit or triangle collapse limit if (collapses == 0) break; remapEdgeLoops(loop, vertex_count, collapse_remap); + remapEdgeLoops(loopback, vertex_count, collapse_remap); size_t new_count = remapIndexBuffer(result, result_count, collapse_remap); assert(new_count < result_count); -#if TRACE - float pass_error = 0.f; - for (size_t i = 0; i < edge_collapse_count; ++i) - { - Collapse& c = edge_collapses[collapse_order[i]]; - - if (collapse_remap[c.v0] == c.v1) - pass_error = c.error; - } - - pass_count++; - worst_error = (worst_error < pass_error) ? 
pass_error : worst_error;
-
-		printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
-#endif
-
 		result_count = new_count;
 	}
 
 #if TRACE
-	printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+	printf("result: %d triangles, error: %e; total %d passes\n", int(result_count), sqrtf(result_error), int(pass_count));
 #endif
 
-#if TRACE > 1
-	dumpLockedCollapses(result, result_count, vertex_kind);
-#endif
-
-#if TRACE
+#ifndef NDEBUG
 	if (meshopt_simplifyDebugKind)
 		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
 
 	if (meshopt_simplifyDebugLoop)
 		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+
+	if (meshopt_simplifyDebugLoopBack)
+		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
 #endif
 
+	// result_error is quadratic; we need to remap it back to linear
+	if (out_result_error)
+		*out_result_error = sqrtf(result_error);
+
 	return result_count;
 }
 
-size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, target_index_count, target_error, options, out_result_error);
+}
+
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, out_result_error);
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
 {
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 
 	// we expect to get ~2 triangles/vertex in the output
 	size_t target_cell_count = target_index_count / 6;
 
-	if (target_cell_count == 0)
-		return 0;
-
 	meshopt_Allocator allocator;
 
 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
@@ -1346,18 +1670,25 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	const int kInterpolationPasses = 5;
 
 	// invariant: # of 
triangles in min_grid <= target_count - int min_grid = 0; + int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error)); int max_grid = 1025; size_t min_triangles = 0; size_t max_triangles = index_count / 3; + // when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid + if (min_grid > 1) + { + computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + min_triangles = countTriangles(vertex_ids, indices, index_count); + } + // instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size... int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f); for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass) { - assert(min_triangles < target_index_count / 3); - assert(max_grid - min_grid > 1); + if (min_triangles >= target_index_count / 3 || max_grid - min_grid <= 1) + break; // we clamp the prediction of the grid size to make sure that the search converges int grid_size = next_grid_size; @@ -1368,9 +1699,9 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind #if TRACE printf("pass %d (%s): grid size %d, triangles %d, %s\n", - pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary", - grid_size, int(triangles), - (triangles <= target_index_count / 3) ? "under" : "over"); + pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary", + grid_size, int(triangles), + (triangles <= target_index_count / 3) ? "under" : "over"); #endif float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles)); @@ -1386,16 +1717,18 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind max_triangles = triangles; } - if (triangles == target_index_count / 3 || max_grid - min_grid <= 1) - break; - // we start by using interpolation search - it usually converges faster // however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN) next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2; } if (min_triangles == 0) + { + if (out_result_error) + *out_result_error = 1.f; + return 0; + } // build vertex->cell association by mapping all vertices with the same quantized position to the same cell size_t table_size = hashBuckets2(vertex_count); @@ -1418,27 +1751,38 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count); + // compute error + float result_error = 0.f; + + for (size_t i = 0; i < cell_count; ++i) + result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error; + // collapse triangles! 
// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
 	size_t tritable_size = hashBuckets2(min_triangles);
 	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
 
 	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
-	assert(write <= target_index_count);
 
 #if TRACE
-	printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
+	printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
 #endif
 
+	if (out_result_error)
+		*out_result_error = sqrtf(result_error);
+
 	return write;
 }
 
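Like the edge simplifier, the sloppy simplifier now reports the achieved error through out_result_error; since positions are internally rescaled into a unit cube, the value is relative to the mesh extent. A short usage sketch (names are illustrative) converting it back to absolute units with meshopt_simplifyScale, which is added at the end of this file:

    float result_error = 0.f;
    size_t result_count = meshopt_simplifySloppy(
        result, indices, index_count,
        positions, vertex_count, sizeof(float) * 3,
        target_index_count, /* target_error= */ 0.05f, &result_error);

    // same units as the input positions
    float absolute_error = result_error * meshopt_simplifyScale(positions, vertex_count, sizeof(float) * 3);
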
"under" : "over"); #endif float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices)); @@ -1522,25 +1866,43 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); - // build a quadric for each target cell - Quadric* cell_quadrics = allocator.allocate(cell_count); - memset(cell_quadrics, 0, cell_count * sizeof(Quadric)); + // accumulate points into a reservoir for each target cell + Reservoir* cell_reservoirs = allocator.allocate(cell_count); + memset(cell_reservoirs, 0, cell_count * sizeof(Reservoir)); - fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells); + fillCellReservoirs(cell_reservoirs, cell_count, vertex_positions, vertex_colors, vertex_colors_stride, vertex_count, vertex_cells); // for each target cell, find the vertex with the minimal error unsigned int* cell_remap = allocator.allocate(cell_count); float* cell_errors = allocator.allocate(cell_count); - fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count); + fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight * color_weight, vertex_count); // copy results to the output assert(cell_count <= target_vertex_count); memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count); #if TRACE - printf("result: %d cells\n", int(cell_count)); + // compute error + float result_error = 0.f; + + for (size_t i = 0; i < cell_count; ++i) + result_error = result_error < cell_errors[i] ? 
cell_errors[i] : result_error; + + printf("result: %d cells, %e error\n", int(cell_count), sqrtf(result_error)); #endif return cell_count; } + +float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride); + + return extent; +} diff --git a/Source/ThirdParty/meshoptimizer/spatialorder.cpp b/Source/ThirdParty/meshoptimizer/spatialorder.cpp index b09f80ac6..7b1a06945 100644 --- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp +++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp @@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos { using namespace meshopt; - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; @@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); (void)vertex_count; diff --git a/Source/ThirdParty/meshoptimizer/vcacheoptimizer.cpp b/Source/ThirdParty/meshoptimizer/vcacheoptimizer.cpp index fb8ade4b7..d4b08ba34 100644 --- a/Source/ThirdParty/meshoptimizer/vcacheoptimizer.cpp +++ b/Source/ThirdParty/meshoptimizer/vcacheoptimizer.cpp @@ -110,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned return ~0u; } -static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size) +static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size) { unsigned int best_candidate = ~0u; int best_priority = -1; @@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c]; } - unsigned int cache_holder[2 * (kCacheSizeMax + 3)]; + unsigned int cache_holder[2 * (kCacheSizeMax + 4)]; unsigned int* cache = cache_holder; - unsigned int* cache_new = cache_holder + kCacheSizeMax + 3; + unsigned int* cache_new = cache_holder + kCacheSizeMax + 4; size_t cache_count = 0; unsigned int current_triangle = 0; @@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned { unsigned int index = cache[i]; - if (index != a && index != b && index != c) - { - cache_new[cache_write++] = index; - } + cache_new[cache_write] = index; + cache_write += (index != a && index != b && index != c); } unsigned int* cache_temp = cache; @@ -281,16 +279,16 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned { unsigned int index = indices[current_triangle * 3 + k]; - unsigned int* neighbours = &adjacency.data[0] + 
adjacency.offsets[index]; - size_t neighbours_size = adjacency.counts[index]; + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; - for (size_t i = 0; i < neighbours_size; ++i) + for (size_t i = 0; i < neighbors_size; ++i) { - unsigned int tri = neighbours[i]; + unsigned int tri = neighbors[i]; if (tri == current_triangle) { - neighbours[i] = neighbours[neighbours_size - 1]; + neighbors[i] = neighbors[neighbors_size - 1]; adjacency.counts[index]--; break; } @@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned { unsigned int index = cache[i]; + // no need to update scores if we are never going to use this vertex + if (adjacency.counts[index] == 0) + continue; + int cache_position = i >= cache_size ? -1 : int(i); // update vertex score @@ -314,10 +316,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned vertex_scores[index] = score; // update scores of vertex triangles - const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index]; - const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index]; + const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index]; + const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index]; - for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it) + for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it) { unsigned int tri = *it; assert(!emitted_flags[tri]); @@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned float tri_score = triangle_scores[tri] + score_diff; assert(tri_score > 0); - if (best_score < tri_score) - { - best_triangle = tri; - best_score = tri_score; - } + best_triangle = best_score < tri_score ? tri : best_triangle; + best_score = best_score < tri_score ? 
tri_score : best_score;
 
 			triangle_scores[tri] = tri_score;
 		}
@@ -412,11 +411,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 	{
 		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
 
-		// emit all vertex neighbours
-		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
-		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+		// emit all vertex neighbors
+		const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
 
-		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
 		{
 			unsigned int triangle = *it;
 
@@ -461,7 +460,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
 		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
 
 		// get next vertex
-		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+		current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
 
 		if (current_vertex == ~0u)
 		{
diff --git a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
index 30fbcd454..8ab0662d8 100644
--- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
@@ -42,16 +42,24 @@
 #endif
 
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
-// Note that we need unimplemented-simd128 subset for a few functions that are implemented de-facto
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
-#define SIMD_TARGET __attribute__((target("unimplemented-simd128")))
+// Prevent compiling other variants when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
 #endif
 
 #ifndef SIMD_TARGET
 #define SIMD_TARGET
 #endif
 
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because it requires 64-bit math, which would hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -82,31 +90,14 @@
 #include <wasm_simd128.h>
 #endif
 
-#ifndef TRACE
-#define TRACE 0
-#endif
-
-#if TRACE
-#include <stdio.h>
-#endif
-
 #ifdef SIMD_WASM
-#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
-#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
-#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
-#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
-#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
-#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
-#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
-#endif
-
-#if defined(SIMD_WASM)
-// v128_t wasm_v8x16_swizzle(v128_t a, v128_t b)
-SIMD_TARGET
-static __inline__ v128_t wasm_v8x16_swizzle(v128_t a, v128_t b)
-{
-	return (v128_t)__builtin_wasm_swizzle_v8x16((__i8x16)a, (__i8x16)b);
-}
+#define 
wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i) +#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) +#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) +#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11) +#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15) +#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2) +#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3) #endif namespace meshopt @@ -144,19 +135,6 @@ inline unsigned char unzigzag8(unsigned char v) return -(v & 1) ^ (v >> 1); } -#if TRACE -struct Stats -{ - size_t size; - size_t header; - size_t bitg[4]; - size_t bitb[4]; -}; - -Stats* bytestats; -Stats vertexstats[256]; -#endif - static bool encodeBytesGroupZero(const unsigned char* buffer) { for (size_t i = 0; i < kByteGroupSize; ++i) @@ -242,7 +220,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; if (size_t(data_end - data) < header_size) - return 0; + return NULL; data += header_size; @@ -251,7 +229,7 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, for (size_t i = 0; i < buffer_size; i += kByteGroupSize) { if (size_t(data_end - data) < kByteGroupDecodeLimit) - return 0; + return NULL; int best_bits = 8; size_t best_size = encodeBytesGroupMeasure(buffer + i, 8); @@ -278,17 +256,8 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, assert(data + best_size == next); data = next; - -#if TRACE > 1 - bytestats->bitg[bitslog2]++; - bytestats->bitb[bitslog2] += best_size; -#endif } -#if TRACE > 1 - bytestats->header += header_size; -#endif - return data; } @@ -317,19 +286,9 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data vertex_offset += vertex_size; } -#if TRACE - const unsigned char* olddata = data; - bytestats = &vertexstats[k]; -#endif - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); if (!data) - return 0; - -#if TRACE - bytestats = 0; - vertexstats[k].size += data - olddata; -#endif + return NULL; } memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); @@ -337,7 +296,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data return data; } -#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX)) +#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) { #define READ() byte = *data++ @@ -397,14 +356,14 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; if (size_t(data_end - data) < header_size) - return 0; + return NULL; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) { if (size_t(data_end - data) < kByteGroupDecodeLimit) - return 0; + return NULL; size_t header_offset = i / kByteGroupSize; @@ -429,7 +388,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u { data = decodeBytes(data, data_end, buffer, vertex_count_aligned); if (!data) - return 0; + return NULL; size_t 
vertex_offset = k;
 
@@ -458,7 +417,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
 static unsigned char kDecodeBytesGroupShuffle[256][8];
 static unsigned char kDecodeBytesGroupCount[256];
 
-#ifdef EMSCRIPTEN
+#ifdef __wasm__
 __attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
 #endif
 static bool
@@ -521,6 +480,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
 
@@ -539,11 +510,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
 
@@ -561,7 +546,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -653,24 +642,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
-
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* 
decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) @@ -688,6 +666,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi case 1: { +#ifdef SIMD_LATENCYOPT + unsigned int data32; + memcpy(&data32, data, 4); + data32 &= data32 >> 1; + + // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32 + unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff); + + // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3 + int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60); +#endif + uint8x8_t sel2 = vld1_u8(data); uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0]; uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22); @@ -704,11 +694,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi vst1q_u8(buffer, result); +#ifdef SIMD_LATENCYOPT + return data + 4 + datacnt; +#else return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; +#endif } case 2: { +#ifdef SIMD_LATENCYOPT + unsigned long long data64; + memcpy(&data64, data, 8); + data64 &= data64 >> 1; + data64 &= data64 >> 2; + + // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3 + int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60); +#endif + uint8x8_t sel4 = vld1_u8(data); uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15))); uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]); @@ -724,7 +728,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi vst1q_u8(buffer, result); +#ifdef SIMD_LATENCYOPT + return data + 8 + datacnt; +#else return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; +#endif } case 3: @@ -747,13 +755,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi SIMD_TARGET static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { - // TODO: 8b buffer overrun - should we use splat or extend buffers? v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - // TODO: we should use v8x16_load_splat v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); @@ -763,26 +769,16 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) SIMD_TARGET static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { - v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3); + // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 + const uint64_t magic = 0x000103070f1f3f80ull; - // TODO: when Chrome supports v128.const we can try doing vectorized and? 
@@ -763,26 +769,16 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 SIMD_TARGET
 static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	// TODO: when Chrome supports v128.const we can try doing vectorized and?
-	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
-	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
-
-	uint64_t mask_2 = mask_1a | mask_1b;
-	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
-	uint64_t mask_8 = mask_4 | (mask_4 >> 8);
-
-	mask0 = uint8_t(mask_8);
-	mask1 = uint8_t(mask_8 >> 32);
+	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
+	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
 
 SIMD_TARGET
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
 {
-	unsigned char byte, enc, encv;
-	const unsigned char* data_var;
-
 	switch (bitslog2)
 	{
 	case 0:
@@ -796,7 +792,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
-		// TODO: test 4b load splat
 		v128_t sel2 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 4);
 
@@ -811,8 +806,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		v128_t shuf = decodeShuffleMask(mask0, mask1);
 
-		// TODO: test or/andnot
-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
 
 		wasm_v128_store(buffer, result);
 
@@ -821,7 +815,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 2:
 	{
-		// TODO: test 8b load splat
 		v128_t sel4 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 8);
 
@@ -835,8 +828,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		v128_t shuf = decodeShuffleMask(mask0, mask1);
 
-		// TODO: test or/andnot
-		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
 
 		wasm_v128_store(buffer, result);
 
@@ -927,8 +919,7 @@ SIMD_TARGET
 static v128_t unzigzag8(v128_t v)
 {
 	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
-	// TODO: use wasm_u8x16_shr when v8 fixes codegen for constant shifts
-	v128_t xr = wasm_v128_and(wasm_u16x8_shr(v, 1), wasm_i8x16_splat(127));
+	v128_t xr = wasm_u8x16_shr(v, 1);
 
 	return wasm_v128_xor(xl, xr);
 }
@@ -947,7 +938,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 
 	if (size_t(data_end - data) < header_size)
-		return 0;
+		return NULL;
 
 	data += header_size;
 
@@ -969,7 +960,7 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 	for (; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
-			return 0;
+			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
 
@@ -997,7 +988,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 	{
 		data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
 		if (!data)
-			return 0;
+			return NULL;
 	}
 
 #if defined(SIMD_SSE) || defined(SIMD_AVX)
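The unzigzag8 change above only drops the 16-bit-shift-plus-mask workaround now that wasm_u8x16_shr generates good code; the branchless zigzag inverse itself is unchanged. For reference, a scalar model of the transform it vectorizes (helper names and the exhaustive check are illustrative):

#include <assert.h>

// zigzag maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... so byte deltas of either sign stay small
static unsigned char zigzag8(signed char v)
{
	return (unsigned char)(((unsigned char)v << 1) ^ (unsigned char)(v >> 7));
}

// branchless inverse: -(v & 1) is 0x00 or 0xff - the same xl/xr split the vector code performs
static signed char unzigzag8(unsigned char v)
{
	return (signed char)(-(v & 1) ^ (v >> 1));
}

int main()
{
	for (int i = -128; i < 128; ++i)
		assert(unzigzag8(zigzag8((signed char)i)) == (signed char)i);
	return 0;
}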
@@ -1020,7 +1011,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 
 #ifdef SIMD_WASM
 #define TEMP v128_t
-#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
 #define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 #define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 #define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
@@ -1092,7 +1083,7 @@ static unsigned int getCpuFeatures()
 	return cpuinfo[2];
 }
 
-unsigned int cpuid = getCpuFeatures();
+static unsigned int cpuid = getCpuFeatures();
 #endif
 
 } // namespace meshopt
@@ -1104,10 +1095,6 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-#if TRACE
-	memset(vertexstats, 0, sizeof(vertexstats));
-#endif
-
 	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 
 	unsigned char* data = buffer;
@@ -1160,28 +1147,6 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
-#if TRACE
-	size_t total_size = data - buffer;
-
-	for (size_t k = 0; k < vertex_size; ++k)
-	{
-		const Stats& vsk = vertexstats[k];
-
-		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
-
-#if TRACE > 1
-		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
-		    int(vsk.header),
-		    int(vsk.bitg[0]), int(vsk.bitb[0]),
-		    int(vsk.bitg[1]), int(vsk.bitb[1]),
-		    int(vsk.bitg[2]), int(vsk.bitb[2]),
-		    int(vsk.bitg[3]), int(vsk.bitb[3]));
-#endif
-
-		printf("\n");
-	}
-#endif
-
 	return data - buffer;
 }
 
@@ -1217,7 +1182,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
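The `cpuid & (1 << 9)` test above selects the SIMD decoder when bit 9 of ECX from CPUID leaf 1 is set, i.e. when SSSE3 (needed for _mm_shuffle_epi8) is available; making `cpuid` static additionally keeps the symbol from leaking out of the translation unit. A minimal x86-only sketch of the same check, loosely mirroring getCpuFeatures (the hasSSSE3 name is mine):

#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif

static bool hasSSSE3()
{
#ifdef _MSC_VER
	int cpuinfo[4] = {};
	__cpuid(cpuinfo, 1); // leaf 1: feature flags; cpuinfo[2] is ECX
	return (cpuinfo[2] & (1 << 9)) != 0;
#else
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return (ecx & (1 << 9)) != 0;
#endif
}

int main()
{
	return hasSSSE3() ? 0 : 1; // exit code 0 when the SIMD path would be taken
}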
diff --git a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
index e07d11a7d..4b5f444f0 100644
--- a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
@@ -2,6 +2,7 @@
 #include "meshoptimizer.h"
 
 #include <math.h>
+#include <string.h>
 
 // The block below auto-detects SIMD ISA that can be used on the target platform
 #ifndef MESHOPTIMIZER_NO_SIMD
@@ -29,6 +30,9 @@
 // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
 #if defined(__wasm_simd128__)
 #define SIMD_WASM
+// Prevent compiling other variant when wasm simd compilation is active
+#undef SIMD_NEON
+#undef SIMD_SSE
 #endif
 
 #endif // !MESHOPTIMIZER_NO_SIMD
@@ -51,6 +55,7 @@
 #endif
 
 #ifdef SIMD_WASM
+#undef __DEPRECATED
 #include <wasm_simd128.h>
 #endif
 
@@ -61,6 +66,10 @@
 #define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
 #endif
 
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 namespace meshopt
 {
 
@@ -143,7 +152,8 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 		int m = int(v << 8) >> 8;
 		int e = int(v) >> 24;
 
-		union {
+		union
+		{
 			float f;
 			unsigned int ui;
 		} u;
@@ -158,11 +168,31 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+template <typename T>
+static void dispatchSimd(void (*process)(T*, size_t), T* data, size_t count, size_t stride)
+{
+	assert(stride <= 4);
+
+	size_t count4 = count & ~size_t(3);
+	process(data, count4);
+
+	if (count4 < count)
+	{
+		T tail[4 * 4] = {}; // max stride 4, max count 4
+		size_t tail_size = (count - count4) * stride * sizeof(T);
+		assert(tail_size <= sizeof(tail));
+
+		memcpy(tail, data + count4 * stride, tail_size);
+		process(tail, count - count4);
+		memcpy(data + count4 * stride, tail, tail_size);
+	}
+}
+
 inline uint64_t rotateleft64(uint64_t v, int x)
 {
 #if defined(_MSC_VER) && !defined(__clang__)
 	return _rotl64(v, x);
-#elif defined(__clang__) && __clang_major__ >= 8
+#elif defined(__clang__) && __has_builtin(__builtin_rotateleft64)
 	return __builtin_rotateleft64(v, x);
 #else
 	return (v << (x & 63)) | (v >> ((64 - x) & 63));
@@ -620,7 +650,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 static void decodeFilterOctSimd(short* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);
-	volatile v128_t zmask = wasm_i32x4_splat(0x7fff); // TODO: volatile works around LLVM shuffle "optimizations"
+	const v128_t zmask = wasm_i32x4_splat(0x7fff);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -732,7 +762,8 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
 
 		// compute component index shifted left by 4 (and moved into i32x4 slot)
-		v128_t cm = wasm_i32x4_shl(cf, 4);
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
 
 		// rotate and store
 		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
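dispatchSimd above is what lets later hunks drop the `count % 4 == 0` asserts from the public decode entry points: the SIMD kernels still process four elements per iteration, so the sub-multiple tail is bounced through a zero-padded stack buffer. A toy scalar model of the same pattern (the negate4 stand-in kernel, dispatch name, and int element type are mine):

#include <assert.h>
#include <string.h>

// stand-in for a 4-wide SIMD kernel: like the real ones, it rounds the trip
// count up to whole 4-element chunks and may touch elements past `count`
static void negate4(int* data, size_t count)
{
	for (size_t i = 0; i < count; i += 4)
		for (size_t j = 0; j < 4; ++j)
			data[i + j] = -data[i + j];
}

static void dispatch(void (*process)(int*, size_t), int* data, size_t count, size_t stride)
{
	size_t count4 = count & ~size_t(3); // largest multiple of 4
	process(data, count4);

	if (count4 < count)
	{
		int tail[4 * 4] = {}; // room for 4 elements at stride up to 4; padding is scratch
		size_t tail_size = (count - count4) * stride * sizeof(int);
		memcpy(tail, data + count4 * stride, tail_size);
		process(tail, count - count4); // the kernel may read/write the padding; that's fine here
		memcpy(data + count4 * stride, tail, tail_size);
	}
}

int main()
{
	int v[7] = {1, 2, 3, 4, 5, 6, 7};
	dispatch(negate4, v, 7, 1); // 7 is deliberately not a multiple of 4
	for (int i = 0; i < 7; ++i)
		assert(v[i] == -(i + 1));
	return 0;
}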
@@ -765,57 +796,238 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 }
 #endif
 
+// optimized variant of frexp
+inline int optlog2(float v)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.f = v;
+	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
+	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+}
+
+// optimized variant of ldexp
+inline float optexp2(int e)
+{
+	union
+	{
+		float f;
+		unsigned int ui;
+	} u;
+
+	u.ui = unsigned(e + 127) << 23;
+	return u.f;
+}
+
 } // namespace meshopt
 
-void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size)
+void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
 {
 	using namespace meshopt;
 
-	assert(vertex_count % 4 == 0);
-	assert(vertex_size == 4 || vertex_size == 8);
+	assert(stride == 4 || stride == 8);
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
-	if (vertex_size == 4)
-		decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
+	if (stride == 4)
+		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
 	else
-		decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count);
+		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
 #else
-	if (vertex_size == 4)
-		decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
+	if (stride == 4)
+		decodeFilterOct(static_cast<signed char*>(buffer), count);
 	else
-		decodeFilterOct(static_cast<short*>(buffer), vertex_count);
+		decodeFilterOct(static_cast<short*>(buffer), count);
 #endif
 }
 
-void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size)
+void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride)
 {
 	using namespace meshopt;
 
-	assert(vertex_count % 4 == 0);
-	assert(vertex_size == 8);
-	(void)vertex_size;
+	assert(stride == 8);
+	(void)stride;
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
-	decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
+	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), count, 4);
 #else
-	decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
+	decodeFilterQuat(static_cast<short*>(buffer), count);
 #endif
 }
 
-void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size)
+void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 {
 	using namespace meshopt;
 
-	assert(vertex_count % 4 == 0);
-	assert(vertex_size % 4 == 0);
+	assert(stride > 0 && stride % 4 == 0);
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
-	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int*>(buffer), count * (stride / 4), 1);
 #else
-	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+	decodeFilterExp(static_cast<unsigned int*>(buffer), count * (stride / 4));
 #endif
 }
 
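optlog2/optexp2 above read and build the IEEE-754 bit pattern directly instead of calling frexp/ldexp on the encoder's hot path; for a normalized input, optlog2 returns the exponent e with |v| * 2^-e in [0.5, 1). A quick consistency check against the libm functions (memcpy stands in for the union, and the test values are arbitrary):

#include <assert.h>
#include <math.h>
#include <string.h>

static int optlog2(float v)
{
	unsigned int ui;
	memcpy(&ui, &v, 4);
	// +1 accounts for the implicit leading 1 of the mantissa
	return ui == 0 ? 0 : int((ui >> 23) & 0xff) - 127 + 1;
}

static float optexp2(int e)
{
	unsigned int ui = unsigned(e + 127) << 23;
	float f;
	memcpy(&f, &ui, 4);
	return f; // 2^e for e in [-126, 127]
}

int main()
{
	const float values[] = {1.f, 0.5f, 3.14159f, 1024.f, 1e-3f};

	for (size_t i = 0; i < sizeof(values) / sizeof(values[0]); ++i)
	{
		int e;
		frexpf(values[i], &e); // frexpf also normalizes the mantissa to [0.5, 1)

		assert(optlog2(values[i]) == e);
		assert(optexp2(e) == ldexpf(1.f, e));
		assert(values[i] * optexp2(-e) < 1.f); // mantissa in range, as the encoder needs
	}

	return 0;
}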
+void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 1 && bits <= 16);
+
+	signed char* d8 = static_cast<signed char*>(destination);
+	short* d16 = static_cast<short*>(destination);
+
+	int bytebits = int(stride * 2);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* n = &data[i * 4];
+
+		// octahedral encoding of a unit vector
+		float nx = n[0], ny = n[1], nz = n[2], nw = n[3];
+		float nl = fabsf(nx) + fabsf(ny) + fabsf(nz);
+		float ns = nl == 0.f ? 0.f : 1.f / nl;
+
+		nx *= ns;
+		ny *= ns;
+
+		float u = (nz >= 0.f) ? nx : (1 - fabsf(ny)) * (nx >= 0.f ? 1.f : -1.f);
+		float v = (nz >= 0.f) ? ny : (1 - fabsf(nx)) * (ny >= 0.f ? 1.f : -1.f);
+
+		int fu = meshopt_quantizeSnorm(u, bits);
+		int fv = meshopt_quantizeSnorm(v, bits);
+		int fo = meshopt_quantizeSnorm(1.f, bits);
+		int fw = meshopt_quantizeSnorm(nw, bytebits);
+
+		if (stride == 4)
+		{
+			d8[i * 4 + 0] = (signed char)(fu);
+			d8[i * 4 + 1] = (signed char)(fv);
+			d8[i * 4 + 2] = (signed char)(fo);
+			d8[i * 4 + 3] = (signed char)(fw);
+		}
+		else
+		{
+			d16[i * 4 + 0] = short(fu);
+			d16[i * 4 + 1] = short(fv);
+			d16[i * 4 + 2] = short(fo);
+			d16[i * 4 + 3] = short(fw);
+		}
+	}
+}
+
+void meshopt_encodeFilterQuat(void* destination_, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 8);
+	assert(bits >= 4 && bits <= 16);
+	(void)stride;
+
+	short* destination = static_cast<short*>(destination_);
+
+	const float scaler = sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* q = &data[i * 4];
+		short* d = &destination[i * 4];
+
+		// establish maximum quaternion component
+		int qc = 0;
+		qc = fabsf(q[1]) > fabsf(q[qc]) ? 1 : qc;
+		qc = fabsf(q[2]) > fabsf(q[qc]) ? 2 : qc;
+		qc = fabsf(q[3]) > fabsf(q[qc]) ? 3 : qc;
+
+		// we use double-cover properties to discard the sign
+		float sign = q[qc] < 0.f ? -1.f : 1.f;
+
+		// note: we always encode a cyclical swizzle to be able to recover the order via rotation
+		d[0] = short(meshopt_quantizeSnorm(q[(qc + 1) & 3] * scaler * sign, bits));
+		d[1] = short(meshopt_quantizeSnorm(q[(qc + 2) & 3] * scaler * sign, bits));
+		d[2] = short(meshopt_quantizeSnorm(q[(qc + 3) & 3] * scaler * sign, bits));
+		d[3] = short((meshopt_quantizeSnorm(1.f, bits) & ~3) | qc);
+	}
+}
+
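meshopt_encodeFilterOct above stores the octahedral (u, v) pair plus a quantized 1.0 in the third component; the matching decoder recovers z as 1 - |u| - |v|, unfolds the lower hemisphere, and renormalizes. A scalar roundtrip of that mapping with quantization omitted (octDecode and the tolerance are illustrative, not the library's API):

#include <assert.h>
#include <math.h>

static void octDecode(float u, float v, float out[3])
{
	// on the lower hemisphere the encoder folded (u, v); unfold symmetrically
	float z = 1.f - fabsf(u) - fabsf(v);
	float t = z < 0.f ? -z : 0.f;

	out[0] = u + (u >= 0.f ? -t : t);
	out[1] = v + (v >= 0.f ? -t : t);
	out[2] = z;

	float len = sqrtf(out[0] * out[0] + out[1] * out[1] + out[2] * out[2]);
	out[0] /= len, out[1] /= len, out[2] /= len;
}

int main()
{
	float n[3] = {0.36f, -0.48f, 0.8f}; // already unit length, upper hemisphere

	// forward mapping as in meshopt_encodeFilterOct (nz >= 0 case)
	float nl = fabsf(n[0]) + fabsf(n[1]) + fabsf(n[2]);
	float u = n[0] / nl, v = n[1] / nl;

	float r[3];
	octDecode(u, v, r);

	for (int i = 0; i < 3; ++i)
		assert(fabsf(r[i] - n[i]) < 1e-6f);
	return 0;
}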
+void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode)
+{
+	using namespace meshopt;
+
+	assert(stride > 0 && stride % 4 == 0 && stride <= 256);
+	assert(bits >= 1 && bits <= 24);
+
+	unsigned int* destination = static_cast<unsigned int*>(destination_);
+	size_t stride_float = stride / sizeof(float);
+
+	int component_exp[64];
+	assert(stride_float <= sizeof(component_exp) / sizeof(int));
+
+	const int min_exp = -100;
+
+	if (mode == meshopt_EncodeExpSharedComponent)
+	{
+		for (size_t j = 0; j < stride_float; ++j)
+			component_exp[j] = min_exp;
+
+		for (size_t i = 0; i < count; ++i)
+		{
+			const float* v = &data[i * stride_float];
+
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (component_exp[j] < e) ? e : component_exp[j];
+			}
+		}
+	}
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* v = &data[i * stride_float];
+		unsigned int* d = &destination[i * stride_float];
+
+		int vector_exp = min_exp;
+
+		if (mode == meshopt_EncodeExpSharedVector)
+		{
+			// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				vector_exp = (vector_exp < e) ? e : vector_exp;
+			}
+		}
+		else if (mode == meshopt_EncodeExpSeparate)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (min_exp < e) ? e : min_exp;
+			}
+		}
+
+		for (size_t j = 0; j < stride_float; ++j)
+		{
+			int exp = (mode == meshopt_EncodeExpSharedVector) ? vector_exp : component_exp[j];
+
+			// note that we additionally scale the mantissa to make it a K-bit signed integer (K-1 bits for magnitude)
+			exp -= (bits - 1);
+
+			// compute renormalized rounded mantissa for each component
+			int mmask = (1 << 24) - 1;
+
+			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
+
+			d[j] = (m & mmask) | (unsigned(exp) << 24);
+		}
+	}
+}
+
 #undef SIMD_SSE
 #undef SIMD_NEON
 #undef SIMD_WASM
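For reference on the format meshopt_encodeFilterExp produces: every 32-bit output word packs a 24-bit two's-complement mantissa with an 8-bit exponent, reconstructed as m * 2^e (the inverse performed by the scalar decodeFilterExp earlier in this file); the three modes only differ in how many values share an exponent. A minimal decode sketch with one hand-encoded value (the constants are worked out in the comments):

#include <assert.h>
#include <math.h>

static float decodeExp(unsigned int v)
{
	int m = int(v << 8) >> 8; // sign-extend the low 24 mantissa bits
	int e = int(v) >> 24;     // arithmetic shift keeps the exponent's sign

	return ldexpf(float(m), e);
}

int main()
{
	// encode 0.15625f with bits = 15 the way meshopt_encodeFilterExp does:
	// optlog2(0.15625) = -2, biased by bits - 1 gives e = -16,
	// and m = round(0.15625 * 2^16) = 10240
	unsigned int encoded = (10240u & 0xffffffu) | (unsigned(-16) << 24);

	assert(decodeExp(encoded) == 0.15625f);
	return 0;
}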