diff --git a/README.md b/README.md
index 9cbc481..b5097da 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ If you want to use the original C++ implementation from Rust, check out the [mes
 
 ## Features
 
-`meshoptimizer` v0.18 feature level is the current support target. Planned but currently missing features:
+`meshoptimizer` v0.20 feature level is the current support target. Planned but currently missing features:
 - [SIMD support](https://github.com/yzsolt/meshopt-rs/issues/1)
 - [WASM support](https://github.com/yzsolt/meshopt-rs/issues/2)
 
diff --git a/benches/with_input.rs b/benches/with_input.rs
index efff25c..eb80f5f 100644
--- a/benches/with_input.rs
+++ b/benches/with_input.rs
@@ -10,7 +10,7 @@ use meshopt_rs::index::generator::{
 use meshopt_rs::index::sequence::{decode_index_sequence, encode_index_sequence, encode_index_sequence_bound};
 use meshopt_rs::overdraw::optimize_overdraw;
 use meshopt_rs::stripify::{stripify, stripify_bound};
-use meshopt_rs::vertex::Position;
+use meshopt_rs::vertex::Vertex;
 use meshopt_rs::vertex::cache::{optimize_vertex_cache, optimize_vertex_cache_fifo, optimize_vertex_cache_strip};
 use meshopt_rs::vertex::fetch::{optimize_vertex_fetch, optimize_vertex_fetch_remap};
 
@@ -19,13 +19,13 @@ use std::path::Path;
 
 #[derive(Clone, Copy, Default)]
 #[repr(C)]
-struct Vertex {
+struct BenchVertex {
     p: [f32; 3],
     n: [f32; 3],
     t: [f32; 2],
 }
 
-impl Position for Vertex {
+impl Vertex for BenchVertex {
     fn pos(&self) -> [f32; 3] {
         self.p
     }
@@ -33,7 +33,7 @@ impl Position for Vertex {
 
 #[derive(Clone, Default)]
 struct Mesh {
-    vertices: Vec<Vertex>,
+    vertices: Vec<BenchVertex>,
     indices: Vec<u32>,
 }
 
@@ -61,7 +61,7 @@ impl Mesh {
             indices.extend_from_slice(&mesh.indices);
 
             for i in 0..mesh.indices.len() {
-                let mut vertex = Vertex::default();
+                let mut vertex = BenchVertex::default();
 
                 let pi = mesh.indices[i] as usize;
                 vertex.p.copy_from_slice(&mesh.positions[3 * pi..3 * (pi + 1)]);
@@ -89,7 +89,7 @@ impl Mesh {
 
         result.indices = remap;
 
-        result.vertices.resize(total_vertices, Vertex::default());
+        result.vertices.resize(total_vertices, BenchVertex::default());
         remap_vertex_buffer(&mut result.vertices, &vertices, &result.indices);
 
         Ok(result)
diff --git a/bin/codec_bench.rs b/bin/codec_bench.rs
index 6d1dbfc..8029f9a 100644
--- a/bin/codec_bench.rs
+++ b/bin/codec_bench.rs
@@ -2,7 +2,7 @@
 
 use meshopt_rs::index::IndexEncodingVersion;
 use meshopt_rs::index::buffer::{decode_index_buffer, encode_index_buffer, encode_index_buffer_bound};
-use meshopt_rs::vertex::Position;
+use meshopt_rs::vertex::Vertex;
 use meshopt_rs::vertex::VertexEncodingVersion;
 use meshopt_rs::vertex::buffer::{decode_vertex_buffer, encode_vertex_buffer, encode_vertex_buffer_bound};
 use meshopt_rs::vertex::cache::{optimize_vertex_cache, optimize_vertex_cache_strip};
@@ -12,11 +12,11 @@ use std::time::Instant;
 
 #[derive(Clone, Copy, Default)]
 #[repr(C)]
-struct Vertex {
+struct BenchVertex {
     data: [u16; 16],
 }
 
-impl Position for Vertex {
+impl Vertex for BenchVertex {
     fn pos(&self) -> [f32; 3] {
         let get_f32 = |start: usize| {
             let a = self.data[start].to_le_bytes();
@@ -38,11 +38,11 @@ fn murmur3(mut h: u32) -> u32 {
     h
 }
 
-fn bench_codecs(vertices: &[Vertex], indices: &[u32], bestvd: &mut f64, bestid: &mut f64, verbose: bool) {
-    let mut vb = vec![Vertex::default(); vertices.len()];
+fn bench_codecs(vertices: &[BenchVertex], indices: &[u32], bestvd: &mut f64, bestid: &mut f64, verbose: bool) {
+    let mut vb = vec![BenchVertex::default(); vertices.len()];
     let mut ib = vec![0u32; indices.len()];
 
-    let mut vc = vec![0u8; encode_vertex_buffer_bound(vertices.len(), std::mem::size_of::<Vertex>())];
+    let mut vc = vec![0u8; encode_vertex_buffer_bound(vertices.len(), std::mem::size_of::<BenchVertex>())];
     let mut ic = vec![0u8; encode_index_buffer_bound(indices.len(), vertices.len())];
 
     if verbose {
@@ -203,7 +203,7 @@ fn main() {
 
     for x in 0..=N {
         for y in 0..=N {
-            let mut v = Vertex::default();
+            let mut v = BenchVertex::default();
 
             for k in 0..16 {
                 let h = murmur3((x * (N + 1) + y) * 16 + k);
diff --git a/examples/demo.rs b/examples/demo.rs
index 320e41f..e95948c 100644
--- a/examples/demo.rs
+++ b/examples/demo.rs
@@ -8,6 +8,7 @@ use meshopt_rs::index::*;
 use meshopt_rs::overdraw::*;
 use meshopt_rs::quantize::*;
 use meshopt_rs::simplify::*;
+use meshopt_rs::spatial_order::*;
 use meshopt_rs::stripify::*;
 use meshopt_rs::vertex::buffer::*;
 use meshopt_rs::vertex::cache::*;
@@ -15,9 +16,6 @@ use meshopt_rs::vertex::fetch::*;
 use meshopt_rs::vertex::*;
 use meshopt_rs::{INVALID_INDEX, Stream};
 
-#[cfg(feature = "experimental")]
-use meshopt_rs::spatial_order::*;
-
 use std::env;
 use std::fmt::Debug;
 use std::hash::{Hash, Hasher};
@@ -26,33 +24,43 @@ use std::time::Instant;
 
 #[derive(Clone, Copy, Default, Debug)]
 #[repr(C)]
-struct Vertex {
+struct DemoVertex {
     p: [f32; 3],
     n: [f32; 3],
     t: [f32; 2],
 }
 
-impl Vertex {
+impl DemoVertex {
     fn as_bytes(&self) -> &[u8] {
         unsafe { std::slice::from_raw_parts((self as *const Self) as *const u8, std::mem::size_of::<Self>()) }
     }
 }
 
-impl Hash for Vertex {
+impl Hash for DemoVertex {
     fn hash<H: Hasher>(&self, state: &mut H) {
         state.write(self.as_bytes());
     }
 }
 
-impl PartialEq for Vertex {
+impl PartialEq for DemoVertex {
     fn eq(&self, other: &Self) -> bool {
         self.as_bytes() == other.as_bytes()
     }
 }
 
-impl Eq for Vertex {}
+impl Eq for DemoVertex {}
 
-impl Position for Vertex {
+impl Vertex<3> for DemoVertex {
+    fn pos(&self) -> [f32; 3] {
+        self.p
+    }
+
+    fn attrs(&self) -> [f32; 3] {
+        self.n
+    }
+}
+
+impl Vertex<0> for DemoVertex {
     fn pos(&self) -> [f32; 3] {
         self.p
     }
@@ -60,7 +68,7 @@ impl Position for Vertex {
 
 #[derive(Clone, Default)]
 struct Mesh {
-    vertices: Vec<Vertex>,
+    vertices: Vec<DemoVertex>,
     indices: Vec<u32>,
 }
 
@@ -90,7 +98,7 @@ impl Mesh {
             indices.extend_from_slice(&mesh.indices);
 
             for i in 0..mesh.indices.len() {
-                let mut vertex = Vertex::default();
+                let mut vertex = DemoVertex::default();
 
                 let pi = mesh.indices[i] as usize;
                 let ni = mesh.normal_indices[i] as usize;
@@ -122,7 +130,7 @@ impl Mesh {
 
         result.indices = remap;
 
-        result.vertices.resize(total_vertices, Vertex::default());
+        result.vertices.resize(total_vertices, DemoVertex::default());
         remap_vertex_buffer(&mut result.vertices, &vertices, &result.indices);
 
         let indexed = start.elapsed();
@@ -147,7 +155,7 @@ impl Mesh {
         self.indices.iter().all(|i| (*i as usize) < self.vertices.len())
     }
 
-    fn rotate_triangle(t: &mut [Vertex; 3]) -> bool {
+    fn rotate_triangle(t: &mut [DemoVertex; 3]) -> bool {
         use std::cmp::Ordering;
 
         let c01 = t[0].as_bytes().cmp(t[1].as_bytes());
@@ -202,7 +210,7 @@ impl Mesh {
             // skip degenerate triangles since some algorithms don't preserve them
             if Self::rotate_triangle(&mut v) {
                 let data = unsafe {
-                    std::slice::from_raw_parts(v.as_ptr() as *const u8, std::mem::size_of::<Vertex>() * v.len())
+                    std::slice::from_raw_parts(v.as_ptr() as *const u8, std::mem::size_of::<DemoVertex>() * v.len())
                 };
 
                 let hash = Self::hash_range(data);
@@ -304,7 +312,7 @@ struct PackedVertex {
     t: [u16; 2],
 }
 
-fn pack_mesh(pv: &mut [PackedVertex], vertices: &[Vertex]) {
+fn pack_mesh(pv: &mut [PackedVertex], vertices: &[DemoVertex]) {
     for i in 0..vertices.len() {
         let vi = vertices[i];
         let pvi = &mut pv[i];
@@ -332,7 +340,7 @@ struct PackedVertexOct {
     t: [u16; 2],
 }
 
-fn pack_mesh_oct(pv: &mut [PackedVertexOct], vertices: &[Vertex]) {
+fn pack_mesh_oct(pv: &mut [PackedVertexOct], vertices: &[DemoVertex]) {
     for i in 0..vertices.len() {
         let vi = vertices[i];
         let pvi = &mut pv[i];
@@ -378,7 +386,7 @@ where
     assert_eq!(mesh.hash(), copy.hash());
 
     let vcs = analyze_vertex_cache(&copy.indices, copy.vertices.len(), CACHE_SIZE, 0, 0);
-    let vfs = analyze_vertex_fetch(&copy.indices, copy.vertices.len(), std::mem::size_of::<Vertex>());
+    let vfs = analyze_vertex_fetch(&copy.indices, copy.vertices.len(), std::mem::size_of::<DemoVertex>());
     let os = analyze_overdraw(&copy.indices, &copy.vertices);
 
     let vcs_nv = analyze_vertex_cache(&copy.indices, copy.vertices.len(), 32, 32, 32);
@@ -586,14 +594,10 @@ fn simplify_mesh(mesh: &Mesh) {
     );
     lod.indices.resize(size, 0);
 
-    let size = if lod.indices.len() < mesh.vertices.len() {
-        lod.indices.len()
-    } else {
-        mesh.vertices.len()
-    };
-    lod.vertices.resize(size, Vertex::default()); // note: this is just to reduce the cost of relen()
+    let size = lod.indices.len().min(mesh.vertices.len());
+    lod.vertices.resize(size, DemoVertex::default()); // note: this is just to reduce the cost of resize()
     let size = optimize_vertex_fetch(&mut lod.vertices, &mut lod.indices, &mesh.vertices);
-    lod.vertices.resize(size, Vertex::default());
+    lod.vertices.resize(size, DemoVertex::default());
 
     let duration = start.elapsed();
 
@@ -607,6 +611,49 @@ fn simplify_mesh(mesh: &Mesh) {
     );
 }
 
+#[cfg(feature = "experimental")]
+fn simplify_attr(mesh: &Mesh, threshold: f32) {
+    let mut lod = Mesh::default();
+
+    let start = Instant::now();
+
+    let target_index_count = (mesh.indices.len() as f32 * threshold) as usize;
+    let target_error = 1e-2;
+    let mut result_error = 0.0;
+
+    const NRM_WEIGHT: f32 = 0.01;
+    const ATTR_WEIGHTS: [f32; 3] = [NRM_WEIGHT, NRM_WEIGHT, NRM_WEIGHT];
+
+    lod.indices.resize(mesh.indices.len(), 0); // note: simplify needs space for index_count elements in the destination array, not target_index_count
+    let size = simplify_with_attributes::<DemoVertex, 3>(
+        &mut lod.indices,
+        &mesh.indices,
+        &mesh.vertices,
+        &ATTR_WEIGHTS,
+        target_index_count,
+        target_error,
+        SimplificationOptions::empty(),
+        Some(&mut result_error),
+    );
+    lod.indices.truncate(size);
+
+    let size = lod.indices.len().min(mesh.vertices.len());
+    lod.vertices.resize(size, DemoVertex::default()); // note: this is just to reduce the cost of resize()
+    let size = optimize_vertex_fetch(&mut lod.vertices, &mut lod.indices, &mesh.vertices);
+    lod.vertices.truncate(size);
+
+    let duration = start.elapsed();
+
+    println!(
+        "{:9}: {} triangles => {} triangles ({:.2}% deviation) in {:.2} msec",
+        "SimplifyAttr",
+        mesh.indices.len() / 3,
+        lod.indices.len() / 3,
+        result_error * 100.0,
+        duration.as_micros() as f64 / 1000.0
+    );
+}
+
 #[cfg(feature = "experimental")]
 fn simplify_mesh_sloppy(mesh: &Mesh, threshold: f32) {
     let mut lod = Mesh::default();
@@ -658,7 +705,7 @@ fn simplify_mesh_points(mesh: &Mesh, threshold: f32) {
     let target_vertex_count = (mesh.vertices.len() as f32 * threshold) as usize;
 
     let mut indices = vec![0; target_vertex_count];
-    let size = simplify_points(&mut indices, &mesh.vertices, target_vertex_count);
+    let size = simplify_points(&mut indices, &mesh.vertices, target_vertex_count, 0.0);
     indices.resize(size, Default::default());
 
     let duration = start.elapsed();
@@ -782,7 +829,7 @@ fn simplify_mesh_complete(mesh: &Mesh) {
         let vfs_0 = analyze_vertex_fetch(
             &indices[offset0..offset0 + lod_index_counts[0]],
             vertices.len(),
-            std::mem::size_of::<Vertex>(),
+            std::mem::size_of::<DemoVertex>(),
         );
         let offsetn = lod_index_offsets[LOD_COUNT - 1];
         let vcs_n = analyze_vertex_cache(
@@ -795,7 +842,7 @@ fn simplify_mesh_complete(mesh: &Mesh) {
         let vfs_n = analyze_vertex_fetch(
             &indices[offsetn..offsetn + lod_index_counts[LOD_COUNT - 1]],
             vertices.len(),
-            std::mem::size_of::<Vertex>(),
+            std::mem::size_of::<DemoVertex>(),
         );
 
         let mut pv = vec![PackedVertexOct::default(); vertices.len()];
@@ -1057,7 +1104,6 @@ fn meshlets(mesh: &Mesh, scan: bool) {
     );
 }
 
-#[cfg(feature = "experimental")]
 fn spatial_sort_mesh(mesh: &Mesh) {
     let mut pv = vec![PackedVertexOct::default(); mesh.vertices.len()];
     pack_mesh_oct(&mut pv, &mesh.vertices);
@@ -1308,17 +1354,21 @@ fn process(mesh: &Mesh) {
 
     #[cfg(feature = "experimental")]
     {
+        simplify_attr(mesh, 0.2);
         simplify_mesh_sloppy(mesh, 0.2);
         simplify_mesh_complete(mesh);
         simplify_mesh_points(mesh, 0.2);
-
-        spatial_sort_mesh(mesh);
-        spatial_sort_mesh_triangles(mesh);
     }
+
+    spatial_sort_mesh(mesh);
+
+    #[cfg(feature = "experimental")]
+    spatial_sort_mesh_triangles(mesh);
 }
 
-fn process_dev(mesh: &Mesh) {
-    meshlets(mesh, false);
+fn process_dev(#[allow(unused)] mesh: &Mesh) {
+    #[cfg(feature = "experimental")]
+    simplify_attr(mesh, 0.2);
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
diff --git a/src/cluster.rs b/src/cluster.rs
index c2e16be..06e6406 100644
--- a/src/cluster.rs
+++ b/src/cluster.rs
@@ -2,7 +2,7 @@
 
 use crate::quantize::quantize_snorm;
 use crate::util::zero_inverse;
-use crate::vertex::{Position, TriangleAdjacency, build_triangle_adjacency};
+use crate::vertex::{TriangleAdjacency, Vertex, build_triangle_adjacency};
 
 const UNUSED: u8 = 0xff;
 
@@ -127,7 +127,7 @@ struct Cone {
     nz: f32,
 }
 
-impl Position for Cone {
+impl Vertex for Cone {
     fn pos(&self) -> [f32; 3] {
         [self.px, self.py, self.pz]
     }
@@ -163,9 +163,9 @@ fn get_meshlet_cone(acc: &Cone, triangle_count: u32) -> Cone {
     result
 }
 
-fn compute_triangle_cones<Vertex>(triangles: &mut [Cone], indices: &[u32], vertices: &[Vertex]) -> f32
+fn compute_triangle_cones<V>(triangles: &mut [Cone], indices: &[u32], vertices: &[V]) -> f32
 where
-    Vertex: Position,
+    V: Vertex,
 {
     let face_count = indices.len() / 3;
 
@@ -305,7 +305,7 @@ impl Default for KdNodeType {
 
 fn kd_tree_partition<Point>(indices: &mut [u32], points: &[Point], axis: u32, pivot: f32) -> usize
 where
-    Point: Position,
+    Point: Vertex,
 {
     let mut m = 0;
 
@@ -352,7 +352,7 @@ fn kd_tree_build<Point>(
     leaf_size: usize,
 ) -> usize
 where
-    Point: Position,
+    Point: Vertex,
 {
     assert!(!indices.is_empty());
 
@@ -425,7 +425,7 @@ fn kd_tree_nearest<Point>(
     result: &mut u32,
     limit: &mut f32,
 ) where
-    Point: Position,
+    Point: Vertex,
 {
     let node = &nodes[root as usize];
 
@@ -642,18 +642,18 @@ fn get_neighbor_triangle(
 /// * `max_vertices` and `max_triangles`: must not exceed implementation limits (`max_vertices` <= 255 - not 256!, `max_triangles` <= 512)
 /// * `cone_weight`: should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
 #[allow(clippy::too_many_arguments)]
-pub fn build_meshlets<Vertex>(
+pub fn build_meshlets<V>(
     meshlets: &mut [Meshlet],
     meshlet_vertices: &mut [u32],
     meshlet_triangles: &mut [u8],
     indices: &[u32],
-    vertices: &[Vertex],
+    vertices: &[V],
     max_vertices: usize,
     max_triangles: usize,
     cone_weight: f32,
 ) -> usize
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert!(indices.len().is_multiple_of(3));
 
@@ -825,7 +825,7 @@ where
 /// dot(view, cone_axis) >= cone_cutoff
 /// ```
 ///
-/// For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff:
+/// For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
 /// ```glsl
 /// dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
 /// ```
@@ -846,9 +846,9 @@ where
 /// # Arguments
 ///
 /// * `indices`: should be smaller than or equal to 256*3 (the function assumes clusters of limited size)
-pub fn compute_cluster_bounds<Vertex>(indices: &[u32], vertices: &[Vertex]) -> Bounds
+pub fn compute_cluster_bounds<V>(indices: &[u32], vertices: &[V]) -> Bounds
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert!(indices.len().is_multiple_of(3));
     assert!(indices.len() / 3 <= MESHLET_MAX_TRIANGLES);
@@ -1005,9 +1005,9 @@ where
 /// Creates bounding volumes that can be used for frustum, backface and occlusion culling.
 ///
 /// Same as [compute_cluster_bounds] but with meshlets as input.
-pub fn compute_meshlet_bounds<Vertex>(meshlet_vertices: &[u32], meshlet_triangles: &[u8], vertices: &[Vertex]) -> Bounds
+pub fn compute_meshlet_bounds<V>(meshlet_vertices: &[u32], meshlet_triangles: &[u8], vertices: &[V]) -> Bounds
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert_eq!(meshlet_triangles.len() % 3, 0);
 
@@ -1030,22 +1030,22 @@ where
 mod test {
     use super::*;
 
-    struct Vertex {
+    struct TestVertex {
         x: f32,
         y: f32,
         z: f32,
     }
 
-    impl Position for Vertex {
+    impl Vertex for TestVertex {
         fn pos(&self) -> [f32; 3] {
             [self.x, self.y, self.z]
         }
     }
 
-    fn vb_from_slice(slice: &[f32]) -> Vec<Vertex> {
+    fn vb_from_slice(slice: &[f32]) -> Vec<TestVertex> {
         slice
             .chunks_exact(3)
-            .map(|v| Vertex {
+            .map(|v| TestVertex {
                 x: v[0],
                 y: v[1],
                 z: v[2],
@@ -1060,7 +1060,7 @@ mod test {
         let ib1 = [0, 1, 2];
 
         // all of the bounds below are degenerate as they use 0 triangles, one topology-degenerate triangle and one position-degenerate triangle respectively
-        let bounds0 = compute_cluster_bounds::<Vertex>(&[], &[]);
+        let bounds0 = compute_cluster_bounds::<TestVertex>(&[], &[]);
         let boundsd = compute_cluster_bounds(&ibd, &vbd);
         let bounds1 = compute_cluster_bounds(&ib1, &vbd);
 
diff --git a/src/lib.rs b/src/lib.rs
index 63dde78..ef92d3b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,15 +13,14 @@ pub mod index;
 pub mod overdraw;
 pub mod quantize;
 pub mod simplify;
-#[cfg(feature = "experimental")]
 pub mod spatial_order;
 pub mod stripify;
 pub mod util;
 pub mod vertex;
 
-use std::ops::Range;
+use std::ops::{Range, Sub, SubAssign};
 
-use crate::vertex::Position;
+use crate::vertex::Vertex;
 
 pub const INVALID_INDEX: u32 = u32::MAX;
 
@@ -134,8 +133,16 @@ impl Vector3 {
         Self { x, y, z }
     }
 
+    pub fn length_squared(&self) -> f32 {
+        self.x * self.x + self.y * self.y + self.z * self.z
+    }
+
+    pub fn length(&self) -> f32 {
+        self.length_squared().sqrt()
+    }
+
     pub fn normalize(&mut self) -> f32 {
-        let length = (self.x * self.x + self.y * self.y + self.z * self.z).sqrt();
+        let length = self.length();
 
         if length > 0.0 {
             self.x /= length;
@@ -147,7 +154,35 @@ impl Vector3 {
     }
 }
 
-impl Position for Vector3 {
+impl From<[f32; 3]> for Vector3 {
+    fn from(value: [f32; 3]) -> Self {
+        Self {
+            x: value[0],
+            y: value[1],
+            z: value[2],
+        }
+    }
+}
+
+impl SubAssign for Vector3 {
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = *self - rhs;
+    }
+}
+
+impl Sub for Vector3 {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+            z: self.z - rhs.z,
+        }
+    }
+}
+
+impl Vertex for Vector3 {
     fn pos(&self) -> [f32; 3] {
         [self.x, self.y, self.z]
     }
diff --git a/src/overdraw.rs b/src/overdraw.rs
index b10e521..d7e438f 100644
--- a/src/overdraw.rs
+++ b/src/overdraw.rs
@@ -3,7 +3,7 @@
 use crate::Vector3;
 use crate::quantize::quantize_unorm;
 use crate::util::zero_inverse;
-use crate::vertex::{Position, calc_pos_extents};
+use crate::vertex::{Vertex, calc_pos_extents};
 
 const VIEWPORT: usize = 256;
 
@@ -140,9 +140,9 @@ fn rasterize(buffer: &mut OverdrawBuffer, mut v1: Vector3, mut v2: Vector3, mut
 /// Returns overdraw statistics using a software rasterizer.
 ///
 /// Results may not match actual GPU performance.
-pub fn analyze_overdraw<Vertex>(indices: &[u32], vertices: &[Vertex]) -> OverdrawStatistics
+pub fn analyze_overdraw<V>(indices: &[u32], vertices: &[V]) -> OverdrawStatistics
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert!(indices.len().is_multiple_of(3));
 
@@ -218,9 +218,9 @@ where
     result
 }
 
-fn calculate_sort_data<Vertex>(sort_data: &mut [f32], indices: &[u32], vertices: &[Vertex], clusters: &[u32])
+fn calculate_sort_data<V>(sort_data: &mut [f32], indices: &[u32], vertices: &[V], clusters: &[u32])
 where
-    Vertex: Position,
+    V: Vertex,
 {
     let mut mesh_centroid = [0.0; 3];
 
@@ -511,9 +511,9 @@ fn generate_soft_boundaries(
 /// * `destination`: must contain enough space for the resulting index buffer (`indices.len()` elements)
 /// * `indices`: must contain index data that is the result of [optimize_vertex_cache](crate::vertex::cache::optimize_vertex_cache) (**not** the original mesh indices!)
 /// * `threshold`: indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
-pub fn optimize_overdraw<Vertex>(destination: &mut [u32], indices: &[u32], vertices: &[Vertex], threshold: f32)
+pub fn optimize_overdraw<V>(destination: &mut [u32], indices: &[u32], vertices: &[V], threshold: f32)
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert_eq!(indices.len() % 3, 0);
 
@@ -583,7 +583,7 @@ mod test {
 
     struct DummyVertex;
 
-    impl Position for DummyVertex {
+    impl Vertex for DummyVertex {
         fn pos(&self) -> [f32; 3] {
             [0.0; 3]
         }
diff --git a/src/quantize.rs b/src/quantize.rs
index c2504e7..6e85545 100644
--- a/src/quantize.rs
+++ b/src/quantize.rs
@@ -28,7 +28,7 @@ pub fn quantize_snorm(mut v: f32, n: u32) -> i32 {
     (v * scale + round) as i32
 }
 
-/// Quantizes a float into half-precision floating point value.
+/// Quantizes a float into a half-precision (as defined by IEEE-754 fp16) floating point value.
 ///
 /// Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest.
 ///
@@ -56,7 +56,7 @@ pub fn quantize_half(v: f32) -> u16 {
     (s | h) as u16
 }
 
-/// Quantizes a float into a floating point value with a limited number of significant mantissa bits.
+/// Quantizes a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation.
 ///
 /// Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest.
 ///
@@ -78,3 +78,121 @@ pub fn quantize_float(v: f32, n: i32) -> f32 {
 
     f32::from_bits(ui)
 }
+
+/// Reverses quantization of a half-precision (as defined by IEEE-754 fp16) floating point value.
+///
+/// Preserves Inf/NaN and the sign bit; flushes denormals to zero (so a negative denormal becomes -0.0).
+pub fn dequantize_half(h: u16) -> f32 {
+    let s = ((h & 0x8000) as u32) << 16; // sign bit, moved from bit 15 to bit 31
+    let em = (h & 0x7fff) as u32; // remaining 15 bits: exponent and mantissa
+
+    // bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+    let mut r = (em + (112 << 10)) << 13;
+
+    // zero exponent field (zero or denormal input): flush to zero
+    r = if em < (1 << 10) { 0 } else { r };
+
+    // infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+    // 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+    r += if em >= (31 << 10) { 112 << 23 } else { 0 };
+
+    let ui = s | r;
+
+    f32::from_bits(ui)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_quantize_float() {
+        assert_eq!(quantize_float(1.2345, 23), 1.2345);
+
+        assert_eq!(quantize_float(1.2345, 16), 1.2344971);
+        assert_eq!(quantize_float(1.2345, 8), 1.2343750);
+        assert_eq!(quantize_float(1.2345, 4), 1.25);
+        assert_eq!(quantize_float(1.2345, 1), 1.0);
+
+        assert_eq!(quantize_float(1.0, 0), 1.0);
+
+        assert_eq!(quantize_float(1.0 / 0.0, 0), 1.0 / 0.0);
+        assert_eq!(quantize_float(-1.0 / 0.0, 0), -1.0 / 0.0);
+
+        let nanf = quantize_float(0.0 / 0.0, 8);
+        assert!(nanf.is_nan());
+    }
+
+    #[test]
+    fn test_quantize_half() {
+        // normal
+        assert_eq!(quantize_half(1.2345), 0x3cf0);
+
+        // overflow
+        assert_eq!(quantize_half(65535.0), 0x7c00);
+        assert_eq!(quantize_half(-65535.0), 0xfc00);
+
+        // large
+        assert_eq!(quantize_half(65000.0), 0x7bef);
+        assert_eq!(quantize_half(-65000.0), 0xfbef);
+
+        // small
+        assert_eq!(quantize_half(0.125), 0x3000);
+        assert_eq!(quantize_half(-0.125), 0xb000);
+
+        // very small
+        assert_eq!(quantize_half(1e-4), 0x068e);
+        assert_eq!(quantize_half(-1e-4), 0x868e);
+
+        // underflow
+        assert_eq!(quantize_half(1e-5), 0x0000);
+        assert_eq!(quantize_half(-1e-5), 0x8000);
+
+        // exponent underflow
+        assert_eq!(quantize_half(1e-20), 0x0000);
+        assert_eq!(quantize_half(-1e-20), 0x8000);
+
+        // exponent overflow
+        assert_eq!(quantize_half(1e20), 0x7c00);
+        assert_eq!(quantize_half(-1e20), 0xfc00);
+
+        // inf
+        assert_eq!(quantize_half(1.0 / 0.0), 0x7c00);
+        assert_eq!(quantize_half(-1.0 / 0.0), 0xfc00);
+
+        // nan
+        let nanh = quantize_half(0.0 / 0.0);
+        assert!(nanh == 0x7e00 || nanh == 0xfe00);
+    }
+
+    #[test]
+    fn test_dequantize_half() {
+        // normal
+        assert_eq!(dequantize_half(0x3cf0), 1.234375);
+
+        // large
+        assert_eq!(dequantize_half(0x7bef), 64992.0);
+        assert_eq!(dequantize_half(0xfbef), -64992.0);
+
+        // small
+        assert_eq!(dequantize_half(0x3000), 0.125);
+        assert_eq!(dequantize_half(0xb000), -0.125);
+
+        // very small
+        assert_eq!(dequantize_half(0x068e), 1.00016594e-4);
+        assert_eq!(dequantize_half(0x868e), -1.00016594e-4);
+
+        // denormal
+        assert_eq!(dequantize_half(0x00ff), 0.0);
+        assert_eq!(dequantize_half(0x80ff), 0.0); // actually this is -0.0
+        assert_eq!(1.0 / dequantize_half(0x80ff), -1.0 / 0.0);
+
+        // inf
+        assert_eq!(dequantize_half(0x7c00), 1.0 / 0.0);
+        assert_eq!(dequantize_half(0xfc00), -1.0 / 0.0);
+
+        // nan
+        let nanf = dequantize_half(0x7e00);
+        assert!(nanf.is_nan());
+    }
+}
diff --git a/src/simplify.rs b/src/simplify.rs
index 60c9e39..5755ffe 100644
--- a/src/simplify.rs
+++ b/src/simplify.rs
@@ -1,16 +1,27 @@
 //! Mesh and point cloud simplification
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
+
 use bitflags::bitflags;
 
 use crate::INVALID_INDEX;
 use crate::Vector3;
 use crate::hash::BuildNoopHasher;
 use crate::util::zero_inverse;
-use crate::vertex::{Position, calc_pos_extents};
+use crate::vertex::{Vertex, calc_pos_extents};
 
 use std::collections::{HashMap, hash_map::Entry};
 use std::fmt::Debug;
 use std::ops::AddAssign;
 
+const MAX_ATTRIBUTES: usize = 16;
+
 #[derive(Clone, Default)]
 struct Edge {
     next: u32,
@@ -19,22 +30,22 @@ struct Edge {
 
 #[derive(Default)]
 struct EdgeAdjacency {
-    counts: Vec<u32>,
     offsets: Vec<u32>,
     data: Vec<Edge>,
 }
 
 fn prepare_edge_adjacency(adjacency: &mut EdgeAdjacency, index_count: usize, vertex_count: usize) {
-    adjacency.counts = vec![0; vertex_count];
-    adjacency.offsets = vec![0; vertex_count];
+    adjacency.offsets = vec![0; vertex_count + 1];
     adjacency.data = vec![Edge::default(); index_count];
 }
 
 fn update_edge_adjacency(adjacency: &mut EdgeAdjacency, indices: &[u32], remap: Option<&[u32]>) {
     let face_count = indices.len() / 3;
 
+    let offsets = &mut adjacency.offsets[1..];
+
     // fill edge counts
-    adjacency.counts.fill(0);
+    offsets.fill(0);
 
     for index in indices {
         let v = if let Some(r) = remap {
@@ -43,15 +54,16 @@ fn update_edge_adjacency(adjacency: &mut EdgeAdjacency, indices: &[u32], remap:
             *index as usize
         };
 
-        adjacency.counts[v] += 1;
+        offsets[v] += 1;
     }
 
     // fill offset table
     let mut offset = 0;
 
-    for (o, count) in adjacency.offsets.iter_mut().zip(adjacency.counts.iter()) {
-        *o = offset;
-        offset += *count;
+    for count in offsets.iter_mut() {
+        let c = *count;
+        *count = offset;
+        offset += c;
     }
 
     assert_eq!(offset as usize, indices.len());
@@ -68,25 +80,22 @@ fn update_edge_adjacency(adjacency: &mut EdgeAdjacency, indices: &[u32], remap:
             c = r[c as usize];
         }
 
-        adjacency.data[adjacency.offsets[a as usize] as usize].next = b;
-        adjacency.data[adjacency.offsets[a as usize] as usize].prev = c;
-        adjacency.offsets[a as usize] += 1;
+        adjacency.data[offsets[a as usize] as usize].next = b;
+        adjacency.data[offsets[a as usize] as usize].prev = c;
+        offsets[a as usize] += 1;
 
-        adjacency.data[adjacency.offsets[b as usize] as usize].next = c;
-        adjacency.data[adjacency.offsets[b as usize] as usize].prev = a;
-        adjacency.offsets[b as usize] += 1;
+        adjacency.data[offsets[b as usize] as usize].next = c;
+        adjacency.data[offsets[b as usize] as usize].prev = a;
+        offsets[b as usize] += 1;
 
-        adjacency.data[adjacency.offsets[c as usize] as usize].next = a;
-        adjacency.data[adjacency.offsets[c as usize] as usize].prev = b;
-        adjacency.offsets[c as usize] += 1;
+        adjacency.data[offsets[c as usize] as usize].next = a;
+        adjacency.data[offsets[c as usize] as usize].prev = b;
+        offsets[c as usize] += 1;
     }
 
-    // fix offsets that have been disturbed by the previous pass
-    for (offset, count) in adjacency.offsets.iter_mut().zip(adjacency.counts.iter()) {
-        assert!(*offset >= *count);
-
-        *offset -= *count;
-    }
+    // finalize offsets
+    adjacency.offsets[0] = 0;
+    assert_eq!(*adjacency.offsets.last().unwrap() as usize, indices.len());
 }
 
 mod hash {
@@ -128,9 +137,9 @@ mod hash {
     impl Eq for VertexPosition {}
 }
 
-fn build_position_remap<Vertex>(remap: &mut [u32], wedge: &mut [u32], vertices: &[Vertex])
+fn build_position_remap<V, const ATTR_COUNT: usize>(remap: &mut [u32], wedge: &mut [u32], vertices: &[V])
 where
-    Vertex: Position,
+    V: Vertex<ATTR_COUNT>,
 {
     let mut table = HashMap::with_capacity_and_hasher(vertices.len(), BuildNoopHasher::default());
 
@@ -213,10 +222,10 @@ const HAS_OPPOSITE: [[bool; KIND_COUNT]; KIND_COUNT] = [
 ];
 
 fn has_edge(adjacency: &EdgeAdjacency, a: u32, b: u32) -> bool {
-    let count = adjacency.counts[a as usize] as usize;
-    let offset = adjacency.offsets[a as usize] as usize;
+    let start = adjacency.offsets[a as usize] as usize;
+    let end = adjacency.offsets[a as usize + 1] as usize;
 
-    let edges = &adjacency.data[offset..offset + count];
+    let edges = &adjacency.data[start..end];
 
     edges.iter().any(|d| d.next == b)
 }
@@ -240,10 +249,10 @@ fn classify_vertices(
 
     #[allow(clippy::needless_range_loop)]
     for vertex in 0..vertex_count {
-        let offset = adjacency.offsets[vertex] as usize;
-        let count = adjacency.counts[vertex] as usize;
+        let start = adjacency.offsets[vertex] as usize;
+        let end = adjacency.offsets[vertex + 1] as usize;
 
-        let edges = &adjacency.data[offset..offset + count];
+        let edges = &adjacency.data[start..end];
 
         for edge in edges {
             let target = edge.next;
@@ -362,9 +371,9 @@ fn classify_vertices(
     );
 }
 
-fn rescale_positions<Vertex>(result: &mut [Vector3], vertices: &[Vertex]) -> f32
+fn rescale_positions<V, const ATTR_COUNT: usize>(result: &mut [Vector3], vertices: &[V]) -> f32
 where
-    Vertex: Position,
+    V: Vertex<ATTR_COUNT>,
 {
     let (minv, extent) = calc_pos_extents(vertices);
 
@@ -389,6 +398,27 @@ where
     extent
 }
 
+fn rescale_attributes<V, const ATTR_COUNT: usize>(
+    vertices: &[V],
+    attribute_weights: &[f32; ATTR_COUNT],
+) -> Vec<[f32; ATTR_COUNT]>
+where
+    V: Vertex<ATTR_COUNT>,
+{
+    let mut vertex_weighted_attrs = vec![[0f32; ATTR_COUNT]; vertices.len()];
+
+    for (weighted_attrs, vertex) in vertex_weighted_attrs.iter_mut().zip(vertices.iter()) {
+        for (weighted_attr, (attr, attr_weight)) in weighted_attrs
+            .iter_mut()
+            .zip(vertex.attrs().iter().zip(attribute_weights.iter()))
+        {
+            *weighted_attr = attr * attr_weight;
+        }
+    }
+
+    vertex_weighted_attrs
+}
+
 union CollapseUnion {
     bidi: u32,
     error: f32,
@@ -427,6 +457,7 @@ struct Collapse {
 
 #[derive(Clone, Copy, Default, Debug)]
 struct Quadric {
+    // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c
     a00: f32,
     a11: f32,
     a22: f32,
@@ -457,24 +488,6 @@ impl AddAssign for Quadric {
 }
 
 impl Quadric {
-    #[cfg(feature = "experimental")]
-    pub fn from_point(x: f32, y: f32, z: f32, w: f32) -> Self {
-        // we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric
-        Self {
-            a00: w,
-            a11: w,
-            a22: w,
-            a10: 0.0,
-            a20: 0.0,
-            a21: 0.0,
-            b0: -2.0 * x * w,
-            b1: -2.0 * y * w,
-            b2: -2.0 * z * w,
-            c: (x * x + y * y + z * z) * w,
-            w,
-        }
-    }
-
     fn from_plane(a: f32, b: f32, c: f32, d: f32, w: f32) -> Self {
         let aw = a * w;
         let bw = b * w;
@@ -532,6 +545,97 @@ impl Quadric {
         Self::from_plane(normal.x, normal.y, normal.z, -distance, length * weight)
     }
 
+    fn from_attributes<const ATTR_COUNT: usize>(
+        g: &mut [QuadricGrad; ATTR_COUNT],
+        p0: &Vector3,
+        p1: &Vector3,
+        p2: &Vector3,
+        va0: &[f32; ATTR_COUNT],
+        va1: &[f32; ATTR_COUNT],
+        va2: &[f32; ATTR_COUNT],
+    ) -> Self {
+        // for each attribute we want to encode the following function into the quadric:
+        // (eval(pos) - attr)^2
+        // where eval(pos) interpolates attribute across the triangle like so:
+        // eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw
+        // where gx/gy/gz/gw are gradients
+        let p10 = *p1 - *p0;
+        let p20 = *p2 - *p0;
+
+        // weight is scaled linearly with edge length
+        let normal = Vector3 {
+            x: p10.y * p20.z - p10.z * p20.y,
+            y: p10.z * p20.x - p10.x * p20.z,
+            z: p10.x * p20.y - p10.y * p20.x,
+        };
+        let area = normal.length();
+        let w = area.sqrt(); // TODO this needs more experimentation
+
+        // we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
+        // v = (d11 * d20 - d01 * d21) / denom
+        // w = (d00 * d21 - d01 * d20) / denom
+        // u = 1 - v - w
+        // here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+        let v0 = &p10;
+        let v1 = &p20;
+        let d00 = v0.length_squared();
+        let d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+        let d11 = v1.length_squared();
+        let denom = d00 * d11 - d01 * d01;
+        let denomr = zero_inverse(denom);
+
+        // precompute gradient factors
+        // these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+        let gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
+        let gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
+        let gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
+        let gy2 = (d00 * v1.y - d01 * v0.y) * denomr;
+        let gz1 = (d11 * v0.z - d01 * v1.z) * denomr;
+        let gz2 = (d00 * v1.z - d01 * v0.z) * denomr;
+
+        let mut q = Quadric {
+            w,
+            ..Default::default()
+        };
+
+        for (k, gg) in g.iter_mut().enumerate() {
+            let a0 = va0[k];
+            let a1 = va1[k];
+            let a2 = va2[k];
+
+            // compute gradient of eval(pos) for x/y/z/w
+            // the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w
+            let gx = gx1 * (a1 - a0) + gx2 * (a2 - a0);
+            let gy = gy1 * (a1 - a0) + gy2 * (a2 - a0);
+            let gz = gz1 * (a1 - a0) + gz2 * (a2 - a0);
+            let gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz;
+
+            // quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
+            // since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+            q.a00 += w * (gx * gx);
+            q.a11 += w * (gy * gy);
+            q.a22 += w * (gz * gz);
+
+            q.a10 += w * (gy * gx);
+            q.a20 += w * (gz * gx);
+            q.a21 += w * (gz * gy);
+
+            q.b0 += w * (gx * gw);
+            q.b1 += w * (gy * gw);
+            q.b2 += w * (gz * gw);
+
+            q.c += w * (gw * gw);
+
+            // the only remaining sum components are ones that depend on attr; these will be added during error evaluation, see Quadric::error_grad
+            gg.gx = w * gx;
+            gg.gy = w * gy;
+            gg.gz = w * gz;
+            gg.gw = w * gw;
+        }
+
+        q
+    }
+
     pub fn error(&self, v: &Vector3) -> f32 {
         let mut rx = self.b0;
         let mut ry = self.b1;
@@ -558,6 +662,77 @@ impl Quadric {
 
         r.abs() * s
     }
+
+    pub fn error_grad<const ATTR_COUNT: usize>(
+        &self,
+        g: &[QuadricGrad; ATTR_COUNT],
+        v: &Vector3,
+        va: &[f32; ATTR_COUNT],
+    ) -> f32 {
+        let mut rx = self.b0;
+        let mut ry = self.b1;
+        let mut rz = self.b2;
+
+        rx += self.a10 * v.y;
+        ry += self.a21 * v.z;
+        rz += self.a20 * v.x;
+
+        rx *= 2.0;
+        ry *= 2.0;
+        rz *= 2.0;
+
+        rx += self.a00 * v.x;
+        ry += self.a11 * v.y;
+        rz += self.a22 * v.z;
+
+        let mut r = self.c;
+        r += rx * v.x;
+        r += ry * v.y;
+        r += rz * v.z;
+
+        // see Quadric::from_attributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
+        for (a, gg) in va.iter().zip(g.iter()) {
+            let g = v.x * gg.gx + v.y * gg.gy + v.z * gg.gz + gg.gw;
+
+            r += a * a * self.w;
+            r -= 2.0 * a * g;
+        }
+
+        // TODO: weight normalization is breaking attribute error somehow
+        let s = 1.0; // normalization disabled for now; would otherwise be zero_inverse(self.w)
+
+        r.abs() * s
+    }
+}
+
+#[derive(Debug, Default, Clone, Copy)]
+struct QuadricGrad {
+    // gx*x + gy*y + gz*z + gw
+    gx: f32,
+    gy: f32,
+    gz: f32,
+    gw: f32,
+}
+
+impl AddAssign for QuadricGrad {
+    fn add_assign(&mut self, other: Self) {
+        self.gx += other.gx;
+        self.gy += other.gy;
+        self.gz += other.gz;
+        self.gw += other.gw;
+    }
+}
+
+#[cfg(feature = "experimental")]
+#[derive(Default, Debug, Clone, Copy)]
+struct Reservoir {
+    x: f32,
+    y: f32,
+    z: f32,
+    r: f32,
+    g: f32,
+    b: f32,
+    w: f32,
 }
 
 fn fill_face_quadrics(vertex_quadrics: &mut [Quadric], indices: &[u32], vertex_positions: &[Vector3], remap: &[u32]) {
@@ -582,7 +757,7 @@ fn fill_edge_quadrics(
     loopback: &[u32],
 ) {
     for i in indices.chunks_exact(3) {
-        const NEXT: [usize; 3] = [1, 2, 0];
+        const NEXT: [usize; 4] = [1, 2, 0, 1];
 
         for e in 0..3 {
             let i0 = i[e] as usize;
@@ -612,7 +787,7 @@ fn fill_edge_quadrics(
                 continue;
             }
 
-            let i2 = i[NEXT[NEXT[e]]] as usize;
+            let i2 = i[NEXT[e + 1]] as usize;
 
             // we try hard to maintain border edge geometry; seam edges can move more freely
             // due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
@@ -638,6 +813,46 @@ fn fill_edge_quadrics(
     }
 }
 
+fn add_grads<const ATTR_COUNT: usize>(g: &mut [QuadricGrad; ATTR_COUNT], r: &[QuadricGrad; ATTR_COUNT]) {
+    for (gg, rr) in g.iter_mut().zip(r.iter()) {
+        *gg += *rr;
+    }
+}
+
+fn fill_attribute_quadrics<const ATTR_COUNT: usize>(
+    attribute_quadrics: &mut [Quadric],
+    attribute_gradients: &mut [[QuadricGrad; ATTR_COUNT]],
+    indices: &[u32],
+    vertex_positions: &[Vector3],
+    vertex_attributes: &[[f32; ATTR_COUNT]],
+    remap: &[u32],
+) {
+    for i in indices.as_chunks::<3>().0 {
+        let [i0, i1, i2] = i;
+        let [i0, i1, i2] = [*i0 as usize, *i1 as usize, *i2 as usize];
+
+        let mut g = [QuadricGrad::default(); ATTR_COUNT];
+        let qa = Quadric::from_attributes(
+            &mut g,
+            &vertex_positions[i0],
+            &vertex_positions[i1],
+            &vertex_positions[i2],
+            &vertex_attributes[i0],
+            &vertex_attributes[i1],
+            &vertex_attributes[i2],
+        );
+
+        // TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
+        attribute_quadrics[remap[i0] as usize] += qa;
+        attribute_quadrics[remap[i1] as usize] += qa;
+        attribute_quadrics[remap[i2] as usize] += qa;
+
+        add_grads(&mut attribute_gradients[remap[i0] as usize], &g);
+        add_grads(&mut attribute_gradients[remap[i1] as usize], &g);
+        add_grads(&mut attribute_gradients[remap[i2] as usize], &g);
+    }
+}
+
 // does triangle ABC flip when C is replaced with D?
 fn has_triangle_flip(a: &Vector3, b: &Vector3, c: &Vector3, d: &Vector3) -> bool {
     let eb = Vector3::new(b.x - a.x, b.y - a.y, b.z - a.z);
@@ -655,7 +870,7 @@ fn has_triangle_flip(a: &Vector3, b: &Vector3, c: &Vector3, d: &Vector3) -> bool
         eb.x * ed.y - eb.y * ed.x,
     );
 
-    nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z < 0.0
+    nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z <= 0.0
 }
 
 fn has_triangle_flips(
@@ -671,17 +886,16 @@ fn has_triangle_flips(
     let v0 = vertex_positions[i0];
     let v1 = vertex_positions[i1];
 
-    let offset = adjacency.offsets[i0] as usize;
-    let count = adjacency.counts[i0] as usize;
-    let edges = &adjacency.data[offset..offset + count];
+    let start = adjacency.offsets[i0] as usize;
+    let end = adjacency.offsets[i0 + 1] as usize;
+    let edges = &adjacency.data[start..end];
 
     for edge in edges {
         let a = collapse_remap[edge.next as usize] as usize;
         let b = collapse_remap[edge.prev as usize] as usize;
 
-        // skip triangles that get collapsed
-        // note: this is mathematically redundant as if either of these is true, the dot product in has_triangle_flip should be 0
-        if a == i1 || b == i1 {
+        // skip triangles that will get collapsed by i0->i1 collapse or already got collapsed previously
+        if a == i1 || b == i1 || a == b {
             continue;
         }
 
@@ -694,8 +908,28 @@ fn has_triangle_flips(
     false
 }
 
+fn bound_edge_collapses(adjacency: &EdgeAdjacency, index_count: usize, vertex_kind: &[VertexKind]) -> usize {
+    let mut dual_count = 0;
+
+    for (k, w) in vertex_kind.iter().zip(adjacency.offsets.windows(2)) {
+        let c = w[1] - w[0];
+
+        dual_count += if *k == VertexKind::Manifold || *k == VertexKind::Seam {
+            c as usize
+        } else {
+            0
+        };
+    }
+
+    assert!(dual_count <= index_count);
+
+    // pad capacity by 3 so that we can check for overflow once per triangle instead of once per edge
+    (index_count - dual_count / 2) + 3
+}
+
 fn pick_edge_collapses(
     collapses: &mut [Collapse],
+    collapse_capacity: usize,
     indices: &[u32],
     remap: &[u32],
     vertex_kind: &[VertexKind],
@@ -710,6 +944,11 @@ fn pick_edge_collapses(
             let i0 = i[e] as usize;
             let i1 = i[NEXT[e]] as usize;
 
+            // this should never happen as bound_edge_collapses should give an upper bound for the collapse count, but in the unlikely event that it does we can just drop extra collapses
+            if collapse_count + 3 > collapse_capacity {
+                break;
+            }
+
             // this can happen either when input has a zero-length edge, or when we perform collapses for complex
             // topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
             // we leave edges like this alone since they may be important for preserving mesh integrity
@@ -766,10 +1005,13 @@ fn pick_edge_collapses(
     collapse_count
 }
 
-fn rank_edge_collapses(
+fn rank_edge_collapses<const ATTR_COUNT: usize>(
     collapses: &mut [Collapse],
     vertex_positions: &[Vector3],
+    vertex_attributes: &[[f32; ATTR_COUNT]],
     vertex_quadrics: &[Quadric],
+    attribute_quadrics: &[Quadric],
+    attribute_gradients: &[[QuadricGrad; ATTR_COUNT]],
     remap: &[u32],
 ) {
     for c in collapses {
@@ -781,77 +1023,35 @@ fn rank_edge_collapses(
         let j0 = unsafe { if c.u.bidi != 0 { i1 } else { i0 } };
         let j1 = unsafe { if c.u.bidi != 0 { i0 } else { i1 } };
 
-        let qi = vertex_quadrics[remap[i0 as usize] as usize];
-        let qj = vertex_quadrics[remap[j0 as usize] as usize];
-
-        let ei = qi.error(&vertex_positions[i1 as usize]);
-        let ej = qj.error(&vertex_positions[j1 as usize]);
-
-        // pick edge direction with minimal error
-        c.v0 = if ei <= ej { i0 } else { j0 };
-        c.v1 = if ei <= ej { i1 } else { j1 };
-        c.u.error = ei.min(ej);
-    }
-}
-
-#[cfg(feature = "trace")]
-fn dump_edge_collapses(collapses: &[Collapse], vertex_kind: &[VertexKind]) {
-    let mut ckinds = [[0usize; KIND_COUNT]; KIND_COUNT];
-    let mut cerrors = [[f32::MAX; KIND_COUNT]; KIND_COUNT];
-
-    for c in collapses {
-        let i0 = c.v0;
-        let i1 = c.v1;
+        let ri0 = remap[i0 as usize] as usize;
+        let rj0 = remap[j0 as usize] as usize;
 
-        let k0 = vertex_kind[i0 as usize] as usize;
-        let k1 = vertex_kind[i1 as usize] as usize;
-
-        ckinds[k0][k1] += 1;
-        cerrors[k0][k1] = cerrors[k0][k1].min(unsafe { c.u.error });
-    }
+        let qi = vertex_quadrics[ri0];
+        let qj = vertex_quadrics[rj0];
 
-    for k0 in 0..KIND_COUNT {
-        for k1 in 0..KIND_COUNT {
-            if ckinds[k0][k1] != 0 {
-                println!(
-                    "collapses {k0} -> {k1}: {}, min error {:e}",
-                    ckinds[k0][k1],
-                    if ckinds[k0][k1] != 0 {
-                        cerrors[k0][k1].sqrt()
-                    } else {
-                        0.0
-                    }
-                );
-            }
-        }
-    }
-}
-
-#[cfg(feature = "trace")]
-fn dump_locked_collapses(indices: &[u32], vertex_kind: &[VertexKind]) {
-    let mut locked_collapses = [[0usize; KIND_COUNT]; KIND_COUNT];
-
-    for i in indices.chunks_exact(3) {
-        const NEXT: [usize; 3] = [1, 2, 0];
-
-        for e in 0..3 {
-            let i0 = i[e] as usize;
-            let i1 = i[NEXT[e]] as usize;
+        let mut ei = qi.error(&vertex_positions[i1 as usize]);
+        let mut ej = qj.error(&vertex_positions[j1 as usize]);
 
-            let k0 = vertex_kind[i0].index();
-            let k1 = vertex_kind[i1].index();
+        if ATTR_COUNT > 0 {
+            let agi = attribute_quadrics[ri0];
+            let agj = attribute_quadrics[rj0];
 
-            locked_collapses[k0][k1] += (!CAN_COLLAPSE[k0][k1] && !CAN_COLLAPSE[k1][k0]) as usize;
+            ei += agi.error_grad(
+                &attribute_gradients[ri0],
+                &vertex_positions[i1 as usize],
+                &vertex_attributes[i1 as usize],
+            );
+            ej += agj.error_grad(
+                &attribute_gradients[rj0],
+                &vertex_positions[j1 as usize],
+                &vertex_attributes[j1 as usize],
+            );
         }
-    }
 
-    #[allow(clippy::needless_range_loop)]
-    for k0 in 0..KIND_COUNT {
-        for k1 in 0..KIND_COUNT {
-            if locked_collapses[k0][k1] != 0 {
-                println!("locked collapses {k0} -> {k1}: {}", locked_collapses[k0][k1]);
-            }
-        }
+        // pick edge direction with minimal error
+        c.v0 = if ei <= ej { i0 } else { j0 };
+        c.v1 = if ei <= ej { i1 } else { j1 };
+        c.u.error = ei.min(ej);
     }
 }
 
@@ -890,10 +1090,12 @@ fn sort_edge_collapses(sort_order: &mut [u32], collapses: &[Collapse]) {
 }
 
 #[allow(clippy::too_many_arguments)]
-fn perform_edge_collapses(
+fn perform_edge_collapses<const ATTR_COUNT: usize>(
     collapse_remap: &mut [u32],
     collapse_locked: &mut [bool],
     vertex_quadrics: &mut [Quadric],
+    attribute_quadrics: &mut [Quadric],
+    attribute_gradients: &mut [[QuadricGrad; ATTR_COUNT]],
     collapses: &[Collapse],
     collapse_order: &[u32],
     remap: &[u32],
@@ -963,6 +1165,13 @@ fn perform_edge_collapses(
 
         vertex_quadrics[r1] += vertex_quadrics[r0];
 
+        if ATTR_COUNT > 0 {
+            attribute_quadrics[r1] += attribute_quadrics[r0];
+
+            let copy = attribute_gradients[r0];
+            add_grads(&mut attribute_gradients[r1], &copy);
+        }
+
         match vertex_kind[i0] {
             VertexKind::Complex => {
                 let mut v = i0;
@@ -1167,11 +1376,37 @@ mod experimental {
         }
     }
 
-    pub fn fill_cell_quadrics2(cell_quadrics: &mut [Quadric], vertex_positions: &[Vector3], vertex_cells: &[u32]) {
-        for (c, v) in vertex_cells.iter().zip(vertex_positions.iter()) {
-            let q = Quadric::from_point(v.x, v.y, v.z, 1.0);
+    pub fn fill_cell_reservoirs<V>(
+        cell_reservoirs: &mut [Reservoir],
+        vertex_positions: &[Vector3],
+        vertices: &[V],
+        vertex_cells: &[u32],
+    ) where
+        V: Vertex,
+    {
+        for (cell, (vp, v)) in vertex_cells.iter().zip(vertex_positions.iter().zip(vertices.iter())) {
+            let r = &mut cell_reservoirs[*cell as usize];
+
+            let color = if V::HAS_COLORS { v.colors() } else { [0f32; 3] };
+
+            r.x += vp.x;
+            r.y += vp.y;
+            r.z += vp.z;
+            r.r += color[0];
+            r.g += color[1];
+            r.b += color[2];
+            r.w += 1.0;
+        }
+
+        for r in cell_reservoirs {
+            let iw = zero_inverse(r.w);
 
-            cell_quadrics[*c as usize] += q;
+            r.x *= iw;
+            r.y *= iw;
+            r.z *= iw;
+            r.r *= iw;
+            r.g *= iw;
+            r.b *= iw;
         }
     }
 
@@ -1193,6 +1428,40 @@ mod experimental {
         }
     }
 
+    pub fn fill_cell_remap2<V>(
+        cell_remap: &mut [u32],
+        cell_errors: &mut [f32],
+        vertex_cells: &[u32],
+        cell_reservoirs: &[Reservoir],
+        vertex_positions: &[Vector3],
+        vertices: &[V],
+        color_weight: f32,
+    ) where
+        V: Vertex,
+    {
+        for ((i, c), (vp, v)) in vertex_cells
+            .iter()
+            .enumerate()
+            .zip(vertex_positions.iter().zip(vertices.iter()))
+        {
+            let cell = *c as usize;
+            let r = &cell_reservoirs[cell];
+
+            let color = if V::HAS_COLORS { v.colors() } else { [0f32; 3] };
+
+            let pos_error = (vp.x - r.x) * (vp.x - r.x) + (vp.y - r.y) * (vp.y - r.y) + (vp.z - r.z) * (vp.z - r.z);
+            let col_error = (color[0] - r.r) * (color[0] - r.r)
+                + (color[1] - r.g) * (color[1] - r.g)
+                + (color[2] - r.b) * (color[2] - r.b);
+            let error = pos_error + color_weight * col_error;
+
+            if cell_remap[cell] == INVALID_INDEX || cell_errors[cell] > error {
+                cell_remap[cell] = i as u32;
+                cell_errors[cell] = error;
+            }
+        }
+    }
+
     pub fn filter_triangles(
         destination: &mut [u32],
         tritable: &mut HashMap<hash::VertexPosition, u32, BuildNoopHasher>,
@@ -1270,18 +1539,86 @@ bitflags! {
 /// * `destination`: must contain enough space for the target index buffer, worst case is `indices.len()` elements (**not** `target_index_count`)!
 /// * `target_error`: represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
 /// * `result_error`: can be None; when it's not None, it will contain the resulting (relative) error after simplification
-pub fn simplify<Vertex>(
+pub fn simplify<V>(
+    destination: &mut [u32],
+    indices: &[u32],
+    vertices: &[V],
+    target_index_count: usize,
+    target_error: f32,
+    options: SimplificationOptions,
+    result_error: Option<&mut f32>,
+) -> usize
+where
+    V: Vertex,
+{
+    simplify_edge::<V, 0>(
+        destination,
+        indices,
+        vertices,
+        &[],
+        target_index_count,
+        target_error,
+        options,
+        result_error,
+    )
+}
+
+/// Mesh simplifier with attribute metric
+///
+/// The algorithm enhances [`simplify`] by incorporating attribute values into the error metric used to prioritize simplification order; see [`simplify`] documentation for details.
+/// Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to [`simplify`] when using 4 scalar attributes.
+///
+/// # Arguments
+///
+/// * `vertices`: each vertex supplies `ATTR_COUNT` attribute floats through its `Vertex::attrs` implementation
+/// * `attribute_weights`: one weight per attribute (`ATTR_COUNT` floats in total); the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
+///
+/// TODO `target_error`/`result_error` currently use combined distance+attribute error; this may change in the future
+#[cfg(feature = "experimental")]
+#[allow(clippy::too_many_arguments)]
+pub fn simplify_with_attributes<V, const ATTR_COUNT: usize>(
+    destination: &mut [u32],
+    indices: &[u32],
+    vertices: &[V],
+    attribute_weights: &[f32; ATTR_COUNT],
+    target_index_count: usize,
+    target_error: f32,
+    options: SimplificationOptions,
+    result_error: Option<&mut f32>,
+) -> usize
+where
+    V: Vertex<ATTR_COUNT>,
+{
+    simplify_edge(
+        destination,
+        indices,
+        vertices,
+        attribute_weights,
+        target_index_count,
+        target_error,
+        options,
+        result_error,
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+fn simplify_edge<V, const ATTR_COUNT: usize>(
     destination: &mut [u32],
     indices: &[u32],
-    vertices: &[Vertex],
+    vertices: &[V],
+    attribute_weights: &[f32; ATTR_COUNT],
     target_index_count: usize,
     target_error: f32,
     options: SimplificationOptions,
     result_error: Option<&mut f32>,
 ) -> usize
 where
-    Vertex: Position,
+    V: Vertex<ATTR_COUNT>,
 {
+    const {
+        assert!(ATTR_COUNT < MAX_ATTRIBUTES);
+    }
+
     assert_eq!(indices.len() % 3, 0);
     assert!(target_index_count <= indices.len());
 
@@ -1342,7 +1679,23 @@ where
     let mut vertex_positions = vec![Vector3::default(); vertices.len()]; // TODO: spare init?
     rescale_positions(&mut vertex_positions, vertices);
 
+    let vertex_attributes = if ATTR_COUNT > 0 {
+        rescale_attributes(vertices, attribute_weights)
+    } else {
+        Vec::new()
+    };
+
     let mut vertex_quadrics = vec![Quadric::default(); vertices.len()];
+
+    let (mut attribute_quadrics, mut attribute_gradients) = if ATTR_COUNT > 0 {
+        (
+            vec![Quadric::default(); vertices.len()],
+            vec![[QuadricGrad::default(); ATTR_COUNT]; vertices.len()],
+        )
+    } else {
+        (Vec::new(), Vec::new())
+    };
+
     fill_face_quadrics(&mut vertex_quadrics, indices, &vertex_positions, &remap);
     fill_edge_quadrics(
         &mut vertex_quadrics,
@@ -1354,11 +1707,23 @@ where
         &loopback,
     );
 
+    if ATTR_COUNT > 0 {
+        fill_attribute_quadrics::<ATTR_COUNT>(
+            &mut attribute_quadrics,
+            &mut attribute_gradients,
+            indices,
+            &vertex_positions,
+            &vertex_attributes,
+            &remap,
+        );
+    }
+
     result.copy_from_slice(indices);
 
+    let collapse_capacity = bound_edge_collapses(&adjacency, indices.len(), &vertex_kind);
     // TODO: skip init?
-    let mut edge_collapses = vec![Collapse::default(); indices.len()];
-    let mut collapse_order = vec![0u32; indices.len()];
+    let mut edge_collapses = vec![Collapse::default(); collapse_capacity];
+    let mut collapse_order = vec![0u32; collapse_capacity];
     let mut collapse_remap = vec![0u32; vertices.len()];
     let mut collapse_locked = vec![false; vertices.len()];
 
@@ -1377,11 +1742,13 @@ where
 
         let edge_collapse_count = pick_edge_collapses(
             &mut edge_collapses,
+            collapse_capacity,
             &result[0..result_count],
             &remap,
             &vertex_kind,
             &loop_,
         );
+        assert!(edge_collapse_count <= collapse_capacity);
 
         // no edges can be collapsed any more due to topology restrictions
         if edge_collapse_count == 0 {
@@ -1391,13 +1758,13 @@ where
         rank_edge_collapses(
             &mut edge_collapses[0..edge_collapse_count],
             &vertex_positions,
+            &vertex_attributes,
             &vertex_quadrics,
+            &attribute_quadrics,
+            &attribute_gradients,
             &remap,
         );
 
-        #[cfg(feature = "trace")]
-        dump_edge_collapses(&edge_collapses[0..edge_collapse_count], &vertex_kind);
-
         sort_edge_collapses(&mut collapse_order, &edge_collapses[0..edge_collapse_count]);
 
         let triangle_collapse_goal = (result_count - target_index_count) / 3;
@@ -1418,6 +1785,8 @@ where
             &mut collapse_remap,
             &mut collapse_locked,
             &mut vertex_quadrics,
+            &mut attribute_quadrics,
+            &mut attribute_gradients,
             &edge_collapses,
             &collapse_order,
             &remap,
@@ -1450,9 +1819,6 @@ where
         result_error_max.sqrt()
     );
 
-    #[cfg(feature = "trace")]
-    dump_locked_collapses(result, &vertex_kind);
-
     // result_error is quadratic; we need to remap it back to linear
     if let Some(result_error) = result_error {
         *result_error = result_error_max.sqrt();
@@ -1475,16 +1841,16 @@ where
 /// * `target_error`: represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
 /// * `result_error`: can be None; when it's not None, it will contain the resulting (relative) error after simplification
 #[cfg(feature = "experimental")]
-pub fn simplify_sloppy<Vertex>(
+pub fn simplify_sloppy<V>(
     destination: &mut [u32],
     indices: &[u32],
-    vertices: &[Vertex],
+    vertices: &[V],
     target_index_count: usize,
     target_error: f32,
     result_error: Option<&mut f32>,
 ) -> usize
 where
-    Vertex: Position,
+    V: Vertex,
 {
     use experimental::*;
 
@@ -1649,9 +2015,14 @@ where
 ///
 /// * `destination`: must contain enough space for the target index buffer (`target_vertex_count` elements)
 #[cfg(feature = "experimental")]
-pub fn simplify_points<Vertex>(destination: &mut [u32], vertices: &[Vertex], target_vertex_count: usize) -> usize
+pub fn simplify_points<V>(
+    destination: &mut [u32],
+    vertices: &[V],
+    target_vertex_count: usize,
+    color_weight: f32,
+) -> usize
 where
-    Vertex: Position,
+    V: Vertex,
 {
     use experimental::*;
 
@@ -1762,21 +2133,23 @@ where
     table.clear();
     let cell_count = fill_vertex_cells(&mut table, &mut vertex_cells, &vertex_ids);
 
-    // build a quadric for each target cell
-    let mut cell_quadrics = vec![Quadric::default(); cell_count];
+    // accumulate points into a reservoir for each target cell
+    let mut cell_reservoirs = vec![Reservoir::default(); cell_count];
 
-    fill_cell_quadrics2(&mut cell_quadrics, &vertex_positions, &vertex_cells);
+    fill_cell_reservoirs(&mut cell_reservoirs, &vertex_positions, vertices, &vertex_cells);
 
     // for each target cell, find the vertex with the minimal error
     let mut cell_remap = vec![INVALID_INDEX; cell_count];
     let mut cell_errors = vec![0.0; cell_count];
 
-    fill_cell_remap(
+    fill_cell_remap2(
         &mut cell_remap,
         &mut cell_errors,
         &vertex_cells,
-        &cell_quadrics,
+        &cell_reservoirs,
         &vertex_positions,
+        vertices,
+        color_weight,
     );
 
     // copy results to the output
@@ -1784,7 +2157,16 @@ where
     destination[0..cell_count].copy_from_slice(&cell_remap);
 
     #[cfg(feature = "trace")]
-    println!("result: {cell_count} cells");
+    {
+        // compute error
+        let mut result_error = 0.0f32;
+
+        for cell_error in &cell_errors {
+            result_error = result_error.max(*cell_error);
+        }
+
+        println!("result: {} cells, {} error", cell_errors.len(), result_error.sqrt());
+    }
 
     cell_count
 }
@@ -1793,9 +2175,9 @@ where
 ///
 /// Absolute error must be **divided** by the scaling factor before passing it to [simplify] as `target_error`.
 /// Relative error returned by [simplify] via `result_error` must be **multiplied** by the scaling factor to get absolute error.
-pub fn simplify_scale<Vertex>(vertices: &[Vertex]) -> f32
+pub fn simplify_scale<V>(vertices: &[V]) -> f32
 where
-    Vertex: Position,
+    V: Vertex,
 {
     let (_minv, extent) = calc_pos_extents(vertices);
 
@@ -1806,22 +2188,22 @@ where
 mod test {
     use super::*;
 
-    struct Vertex {
+    struct TestVertex {
         x: f32,
         y: f32,
         z: f32,
     }
 
-    impl Position for Vertex {
+    impl Vertex for TestVertex {
         fn pos(&self) -> [f32; 3] {
             [self.x, self.y, self.z]
         }
     }
 
-    fn vb_from_slice(slice: &[f32]) -> Vec<Vertex> {
+    fn vb_from_slice(slice: &[f32]) -> Vec<TestVertex> {
         slice
             .chunks_exact(3)
-            .map(|v| Vertex {
+            .map(|v| TestVertex {
                 x: v[0],
                 y: v[1],
                 z: v[2],
@@ -1829,6 +2211,49 @@ mod test {
             .collect()
     }
 
+    #[test]
+    fn test_simplify() {
+        // 0
+        // 1 2
+        // 3 4 5
+        #[rustfmt::skip]
+        let ib = [
+            0, 2, 1,
+            1, 2, 3,
+            3, 2, 4,
+            2, 5, 4,
+        ];
+
+        #[rustfmt::skip]
+        let vb = vb_from_slice(&[
+            0.0, 4.0, 0.0,
+            0.0, 1.0, 0.0,
+            2.0, 2.0, 0.0,
+            0.0, 0.0, 0.0,
+            1.0, 0.0, 0.0,
+            4.0, 0.0, 0.0,
+        ]);
+
+        let expected = [0, 5, 3];
+
+        let mut error = 1.0;
+        let mut dst = vec![0; ib.len()];
+        assert_eq!(
+            simplify(
+                &mut dst,
+                &ib,
+                &vb,
+                3,
+                1e-2,
+                SimplificationOptions::empty(),
+                Some(&mut error)
+            ),
+            3
+        );
+        assert_eq!(error, 0.0);
+        assert_eq!(&dst[0..expected.len()], expected);
+    }
+
     #[test]
     fn test_simplify_stuck() {
         let mut dst = vec![0; 16];
@@ -1902,7 +2327,7 @@ mod test {
         let vb = vb_from_slice(&[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
 
         // simplifying down to 0 points results in 0 immediately
-        assert_eq!(simplify_points(&mut dst, &vb, 0), 0);
+        assert_eq!(simplify_points(&mut dst, &vb, 0, 0.0), 0);
     }
 
     #[test]
@@ -1913,41 +2338,41 @@ mod test {
         // the wrong collapse is picked instead.
         #[rustfmt::skip]
         let vb = vb_from_slice(&[
-            1.000000, 1.000000, -1.000000, 
-            1.000000, 1.000000, 1.000000, 
-            1.000000, -1.000000, 1.000000, 
-            1.000000, -0.200000, -0.200000, 
-            1.000000, 0.200000, -0.200000, 
-            1.000000, -0.200000, 0.200000, 
-            1.000000, 0.200000, 0.200000, 
-            1.000000, 0.500000, -0.500000, 
+            1.000000, 1.000000, -1.000000,
+            1.000000, 1.000000, 1.000000,
+            1.000000, -1.000000, 1.000000,
+            1.000000, -0.200000, -0.200000,
+            1.000000, 0.200000, -0.200000,
+            1.000000, -0.200000, 0.200000,
+            1.000000, 0.200000, 0.200000,
+            1.000000, 0.500000, -0.500000,
             1.000000, -1.000000, 0.000000,
         ]);
 
         // the collapse we expect is 7 -> 0
         #[rustfmt::skip]
         let ib = [
-            7, 4, 3, 
-            1, 2, 5, 
-            7, 1, 6, 
+            7, 4, 3,
+            1, 2, 5,
+            7, 1, 6,
             7, 8, 0, // gets removed
-            7, 6, 4, 
-            8, 5, 2, 
-            8, 7, 3, 
-            8, 3, 5, 
-            5, 6, 1, 
+            7, 6, 4,
+            8, 5, 2,
+            8, 7, 3,
+            8, 3, 5,
+            5, 6, 1,
             7, 0, 1, // gets removed
         ];
 
         #[rustfmt::skip]
         let expected = [
-            0, 4, 3, 
-            1, 2, 5, 
-            0, 1, 6, 
-            0, 6, 4, 
-            8, 5, 2, 
-            8, 0, 3, 
-            8, 3, 5, 
+            0, 4, 3,
+            1, 2, 5,
+            0, 1, 6,
+            0, 6, 4,
+            8, 5, 2,
+            8, 0, 3,
+            8, 3, 5,
             5, 6, 1,
         ];
 
@@ -1976,7 +2401,7 @@ mod test {
             0.000000, 2.000000, 0.000000,
             1.000000, 0.000000, 0.000000,
             2.000000, 0.000000, 0.000000,
-            1.000000, 1.000000, 0.000000, 
+            1.000000, 1.000000, 0.000000,
         ]);
 
         // 0 1 2
@@ -2065,4 +2490,80 @@ mod test {
         );
         assert_eq!(&dst[0..expected.len()], expected);
     }
+
+    #[test]
+    #[cfg(feature = "experimental")]
+    fn test_simplify_attr() {
+        #[derive(Default, Clone, Copy)]
+        struct TestVertexWithAttributes([[f32; 3]; 2]);
+
+        impl Vertex<3> for TestVertexWithAttributes {
+            fn pos(&self) -> [f32; 3] {
+                self.0[0]
+            }
+
+            fn attrs(&self) -> [f32; 3] {
+                self.0[1]
+            }
+        }
+
+        let mut vb = [TestVertexWithAttributes::default(); 8 * 3];
+
+        for y in 0..8 {
+            // first four rows are a yellow gradient (r = g, b = 0), next four rows are a blue gradient
+            let r = if y < 4 { 0.8 + y as f32 * 0.05 } else { 0.0 };
+            let g = if y < 4 { 0.8 + y as f32 * 0.05 } else { 0.0 };
+            let b = if y < 4 { 0.0 } else { 0.8 + (7 - y) as f32 * 0.05 };
+
+            for x in 0..3 {
+                let v = &mut vb[y * 3 + x].0;
+                v[0][0] = x as f32;
+                v[0][1] = y as f32;
+                v[0][2] = 0.03 * x as f32;
+                v[1][0] = r;
+                v[1][1] = g;
+                v[1][2] = b;
+            }
+        }
+
+        let mut ib = [[0u32; 6]; 7 * 2];
+
+        for y in 0..7 {
+            for x in 0..2 {
+                ib[y * 2 + x][0] = ((y + 0) * 3 + (x + 0)) as u32;
+                ib[y * 2 + x][1] = ((y + 0) * 3 + (x + 1)) as u32;
+                ib[y * 2 + x][2] = ((y + 1) * 3 + (x + 0)) as u32;
+                ib[y * 2 + x][3] = ((y + 1) * 3 + (x + 0)) as u32;
+                ib[y * 2 + x][4] = ((y + 0) * 3 + (x + 1)) as u32;
+                ib[y * 2 + x][5] = ((y + 1) * 3 + (x + 1)) as u32;
+            }
+        }
+
+        let ib = ib.iter().flatten().copied().collect::<Vec<_>>();
+
+        let attr_weights = [0.01, 0.01, 0.01];
+
+        let expected = [
+            [0, 2, 9, 9, 2, 11],
+            [9, 11, 12, 12, 11, 14],
+            [12, 14, 21, 21, 14, 23],
+        ];
+
+        let mut actual = vec![0u32; ib.len()];
+
+        assert_eq!(
+            simplify_with_attributes::<TestVertexWithAttributes, 3>(
+                &mut actual,
+                &ib,
+                &vb,
+                &attr_weights,
+                6 * 3,
+                1e-2,
+                SimplificationOptions::empty(),
+                None
+            ),
+            18
+        );
+        assert!(actual.iter().zip(expected.iter().flatten()).all(|(a, b)| a == b));
+    }
 }
diff --git a/src/spatial_order.rs b/src/spatial_order.rs
index 8e9b649..8b9670e 100644
--- a/src/spatial_order.rs
+++ b/src/spatial_order.rs
@@ -1,8 +1,9 @@
-//! **Experimental** spatial sorting
+//! Spatial sorting
 
+#[cfg(feature = "experimental")]
 use crate::Vector3;
 use crate::util::zero_inverse;
-use crate::vertex::{Position, calc_pos_extents};
+use crate::vertex::{Vertex, calc_pos_extents};
 
 // "Insert" two 0 bits after each of the 10 low bits of x
 #[inline(always)]
@@ -15,9 +16,9 @@ fn part_1_by_2(mut x: u32) -> u32 {
     x
 }
 
-fn compute_order<Vertex>(result: &mut [u32], vertices: &[Vertex])
+fn compute_order<V>(result: &mut [u32], vertices: &[V])
 where
-    Vertex: Position,
+    V: Vertex,
 {
     let (minv, extent) = calc_pos_extents(vertices);
 
@@ -80,9 +81,9 @@ fn radix_pass(destination: &mut [u32], source: &[u32], keys: &[u32], hist: &mut
 /// # Arguments
 ///
 /// * `destination`: must contain enough space for the resulting remap table (`vertices.len()` elements)
-pub fn spatial_sort_remap<Vertex>(destination: &mut [u32], vertices: &[Vertex])
+pub fn spatial_sort_remap<V>(destination: &mut [u32], vertices: &[V])
 where
-    Vertex: Position,
+    V: Vertex,
 {
     let mut keys = vec![0; vertices.len()];
     compute_order(&mut keys, vertices);
@@ -114,9 +115,10 @@ where
 /// # Arguments
 ///
 /// * `destination`: must contain enough space for the resulting index buffer (`indices.len()` elements)
-pub fn spatial_sort_triangles<Vertex>(destination: &mut [u32], indices: &[u32], vertices: &[Vertex])
+#[cfg(feature = "experimental")]
+pub fn spatial_sort_triangles<V>(destination: &mut [u32], indices: &[u32], vertices: &[V])
 where
-    Vertex: Position,
+    V: Vertex,
 {
     assert!(indices.len().is_multiple_of(3));
 
diff --git a/src/vertex/cache.rs b/src/vertex/cache.rs
index d64cba8..45a651e 100644
--- a/src/vertex/cache.rs
+++ b/src/vertex/cache.rs
@@ -251,8 +251,8 @@ fn optimize_vertex_cache_table(
             .sum();
     }
 
-    let mut cache_holder = [0; 2 * (CACHE_SIZE_MAX + 3)];
-    let (mut cache, mut cache_new) = cache_holder.split_at_mut(CACHE_SIZE_MAX + 3);
+    let mut cache_holder = [0; 2 * (CACHE_SIZE_MAX + 4)];
+    let (mut cache, mut cache_new) = cache_holder.split_at_mut(CACHE_SIZE_MAX + 4);
     let mut cache_count = 0;
 
     let mut current_triangle = 0;
@@ -283,10 +283,8 @@ fn optimize_vertex_cache_table(
 
         // old triangles
         for index in &cache[0..cache_count] {
-            if abc.iter().all(|e| *e != *index) {
-                cache_new[cache_write] = *index;
-                cache_write += 1;
-            }
+            cache_new[cache_write] = *index;
+            cache_write += abc.iter().all(|e| *e != *index) as usize;
         }
 
         std::mem::swap(&mut cache, &mut cache_new);
@@ -317,10 +315,15 @@ fn optimize_vertex_cache_table(
         }
 
         let mut best_triangle = INVALID_INDEX;
-        let mut best_score = 0.0;
+        let mut best_score = 0.0f32;
 
         // update cache positions, vertex scores and triangle scores, and find next best triangle
         for (i, index) in cache.iter().map(|index| *index as usize).enumerate().take(cache_write) {
+            // no need to update scores if we are never going to use this vertex
+            if adjacency.counts[index] == 0 {
+                continue;
+            }
+
             let cache_position = if i >= cache_size { -1 } else { i as i32 };
 
             // update vertex score
@@ -341,8 +344,8 @@ fn optimize_vertex_cache_table(
 
                 if best_score < tri_score {
                     best_triangle = *tri;
-                    best_score = tri_score;
                 }
+                best_score = best_score.max(tri_score);
 
                 triangle_scores[*tri as usize] = tri_score;
             }
diff --git a/src/vertex/fetch.rs b/src/vertex/fetch.rs
index 2f4e14f..c9b8fd7 100644
--- a/src/vertex/fetch.rs
+++ b/src/vertex/fetch.rs
@@ -2,7 +2,7 @@
 
 use crate::INVALID_INDEX;
 
-use super::Position;
+use super::Vertex;
 
 #[derive(Default)]
 pub struct VertexFetchStatistics {
@@ -102,9 +102,9 @@ pub fn optimize_vertex_fetch_remap(destination: &mut [u32], indices: &[u32]) ->
 /// # Arguments
 ///
 /// * `destination`: must contain enough space for the resulting vertex buffer (`vertices.len()` elements)
-pub fn optimize_vertex_fetch<Vertex>(destination: &mut [Vertex], indices: &mut [u32], vertices: &[Vertex]) -> usize
+pub fn optimize_vertex_fetch<V>(destination: &mut [V], indices: &mut [u32], vertices: &[V]) -> usize
 where
-    Vertex: Position + Copy,
+    V: Vertex + Copy,
 {
     assert!(indices.len().is_multiple_of(3));
 
diff --git a/src/vertex/mod.rs b/src/vertex/mod.rs
index d757b52..5f410ce 100644
--- a/src/vertex/mod.rs
+++ b/src/vertex/mod.rs
@@ -27,20 +27,30 @@ impl From<VertexEncodingVersion> for u8 {
     }
 }
 
-pub trait Position {
+pub trait Vertex<const ATTR_COUNT: usize = 0> {
+    const HAS_COLORS: bool = false;
+
     fn pos(&self) -> [f32; 3];
+
+    fn attrs(&self) -> [f32; ATTR_COUNT] {
+        [0f32; ATTR_COUNT]
+    }
+
+    fn colors(&self) -> [f32; 3] {
+        [0f32; 3]
+    }
 }
 
-impl Position for [f32; 3] {
+impl Vertex for [f32; 3] {
     #[inline]
     fn pos(&self) -> [f32; 3] {
         *self
     }
 }
 
-pub(crate) fn calc_pos_extents<Vertex>(vertices: &[Vertex]) -> ([f32; 3], f32)
+pub(crate) fn calc_pos_extents<V, const ATTR_COUNT: usize>(vertices: &[V]) -> ([f32; 3], f32)
 where
-    Vertex: Position,
+    V: Vertex<ATTR_COUNT>,
 {
     let mut minv = [f32::MAX; 3];
     let mut maxv = [-f32::MAX; 3];