ethanrous · ethanrous · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/agno/src/codec/webp/decode.rs b/agno/src/codec/webp/decode.rs
@@ -685,44 +685,48 @@ fn decode_token_tree_vp8(dec: &mut BoolDecoder, probs: &[u8; 11]) -> u32 {
 // ---------------------------------------------------------------------------
 
 /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4).
+///
+/// Matches the reference `vp8_short_idct4x4llm_c`: column pass first, row pass
+/// second (with the `+4` rounding bias). See the encoder-side `transform::idct4x4`
+/// for why the pass order is load-bearing.
 fn idct4x4(input: &[i32; 16]) -> [i32; 16] {
     let mut tmp = [0i32; 16];
 
-    // Row pass (no rounding bias -- bias is added in column pass).
+    // Column pass.
     for i in 0..4 {
-        let c0 = input[i * 4];
-        let c1 = input[i * 4 + 1];
-        let c2 = input[i * 4 + 2];
-        let c3 = input[i * 4 + 3];
+        let c0 = input[i];
+        let c1 = input[i + 4];
+        let c2 = input[i + 8];
+        let c3 = input[i + 12];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        tmp[i * 4] = a1 + temp2;
-        tmp[i * 4 + 3] = a1 - temp2;
-        tmp[i * 4 + 1] = b1 + temp1;
-        tmp[i * 4 + 2] = b1 - temp1;
+        tmp[i] = a1 + temp2;
+        tmp[i + 12] = a1 - temp2;
+        tmp[i + 4] = b1 + temp1;
+        tmp[i + 8] = b1 - temp1;
     }
 
-    // Column pass (with +4 rounding bias, then >>3 normalization).
+    // Row pass (with +4 rounding bias, then >>3 normalization).
     let mut result = [0i32; 16];
     for i in 0..4 {
-        let c0 = tmp[i];
-        let c1 = tmp[i + 4];
-        let c2 = tmp[i + 8];
-        let c3 = tmp[i + 12];
+        let c0 = tmp[i * 4];
+        let c1 = tmp[i * 4 + 1];
+        let c2 = tmp[i * 4 + 2];
+        let c3 = tmp[i * 4 + 3];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        result[i] = (a1 + temp2 + 4) >> 3;
-        result[i + 12] = (a1 - temp2 + 4) >> 3;
-        result[i + 4] = (b1 + temp1 + 4) >> 3;
-        result[i + 8] = (b1 - temp1 + 4) >> 3;
+        result[i * 4] = (a1 + temp2 + 4) >> 3;
+        result[i * 4 + 3] = (a1 - temp2 + 4) >> 3;
+        result[i * 4 + 1] = (b1 + temp1 + 4) >> 3;
+        result[i * 4 + 2] = (b1 - temp1 + 4) >> 3;
     }
     result
 }

diff --git a/agno/src/codec/webp/transform.rs b/agno/src/codec/webp/transform.rs
@@ -88,44 +88,50 @@ pub fn fwht4x4(input: &[i16; 16]) -> [i16; 16] {
 }
 
 /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4).
+///
+/// Matches the reference `vp8_short_idct4x4llm_c`: the first pass operates on
+/// columns, the second pass on rows (with the `+4` rounding bias). The pass
+/// order matters because each pass truncates its trig terms with `>>16`, so
+/// the two passes do not commute — running the row pass first yields off-by-one
+/// reconstruction differences from the reference decoder (libwebp).
 pub fn idct4x4(input: &[i32; 16]) -> [i32; 16] {
     let mut tmp = [0i32; 16];
 
-    // Row pass.
+    // Column pass.
     for i in 0..4 {
-        let c0 = input[i * 4];
-        let c1 = input[i * 4 + 1];
-        let c2 = input[i * 4 + 2];
-        let c3 = input[i * 4 + 3];
+        let c0 = input[i];
+        let c1 = input[i + 4];
+        let c2 = input[i + 8];
+        let c3 = input[i + 12];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        tmp[i * 4] = a1 + temp2;
-        tmp[i * 4 + 3] = a1 - temp2;
-        tmp[i * 4 + 1] = b1 + temp1;
-        tmp[i * 4 + 2] = b1 - temp1;
+        tmp[i] = a1 + temp2;
+        tmp[i + 12] = a1 - temp2;
+        tmp[i + 4] = b1 + temp1;
+        tmp[i + 8] = b1 - temp1;
     }
 
-    // Column pass (with +4 rounding bias, then >>3 normalization).
+    // Row pass (with +4 rounding bias, then >>3 normalization).
     let mut result = [0i32; 16];
     for i in 0..4 {
-        let c0 = tmp[i];
-        let c1 = tmp[i + 4];
-        let c2 = tmp[i + 8];
-        let c3 = tmp[i + 12];
+        let c0 = tmp[i * 4];
+        let c1 = tmp[i * 4 + 1];
+        let c2 = tmp[i * 4 + 2];
+        let c3 = tmp[i * 4 + 3];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        result[i] = (a1 + temp2 + 4) >> 3;
-        result[i + 12] = (a1 - temp2 + 4) >> 3;
-        result[i + 4] = (b1 + temp1 + 4) >> 3;
-        result[i + 8] = (b1 - temp1 + 4) >> 3;
+        result[i * 4] = (a1 + temp2 + 4) >> 3;
+        result[i * 4 + 3] = (a1 - temp2 + 4) >> 3;
+        result[i * 4 + 1] = (b1 + temp1 + 4) >> 3;
+        result[i * 4 + 2] = (b1 - temp1 + 4) >> 3;
     }
     result
 }
@@ -158,3 +164,22 @@ pub fn iwht4x4(input: &[i16; 16]) -> [i16; 16] {
     }
     result
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Pins `idct4x4` to the VP8 reference pass order (column pass first, then
+    /// row pass). Each pass truncates its trig terms with `>>16`, so the two
+    /// passes do not commute: a row-first IDCT yields off-by-one results that a
+    /// conformant decoder (libwebp) does not share — which manifested as an
+    /// accumulating chroma cast in WebP output. This input block is chosen so
+    /// that the two pass orders differ; the expected values are the reference
+    /// (column-first) output. A regression to row-first changes indices 0, 3, 5, and 7.
+    #[test]
+    fn idct4x4_uses_vp8_reference_pass_order() {
+        let input = [120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2];
+        let expected = [20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15];
+        assert_eq!(idct4x4(&input), expected);
+    }
+}