From 22d1611800dba873298ec48a1d89b2ccea99de08 Mon Sep 17 00:00:00 2001
From: ethanrous <ethanrousseau99+github@gmail.com>
Date: Sun, 7 Jun 2026 14:45:23 -0400
Subject: [PATCH 1/2] fix: correct VP8 IDCT pass order in WebP codec

agno's inverse 4x4 DCT ran the row pass before the column pass. The VP8
reference (vp8_short_idct4x4llm_c / libwebp TransformOne) runs the column
pass first, then the row pass (with the +4 rounding bias). The >>16
trig-term truncations are not associative across passes, so transposing
them yields off-by-one reconstruction differences from any conformant
decoder.

agno's encoder reconstruction and its own decoder shared the wrong order,
so round-tripping through agno looked correct and hid the bug. When agno's
WebP is decoded by libwebp (browsers, Weblens), chroma drifts: chroma DC
flows through this IDCT (there is no Y2/WHT path for chroma), so the +/-1
errors are not zero-mean on structured content and accumulate via intra
prediction into a visible chroma cast (~+2 U/V on a 1024px crop, ~+6 on
full frames). Luma DC goes through the order-independent WHT, so luma was
barely affected -- matching the observed chroma-only signature.

Swap both idct4x4 implementations (encoder transform.rs, decoder decode.rs)
to column-first / row-second, matching the reference. Add a regression test
pinning the pass order; a round-trip test cannot catch this because the
encoder and decoder shared the deviation.
---
 agno/src/codec/webp/decode.rs    | 40 +++++++++++---------
 agno/src/codec/webp/transform.rs | 65 +++++++++++++++++++++++---------
 2 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/agno/src/codec/webp/decode.rs b/agno/src/codec/webp/decode.rs
index 578c3d6..93e64ab 100644
--- a/agno/src/codec/webp/decode.rs
+++ b/agno/src/codec/webp/decode.rs
@@ -685,44 +685,48 @@ fn decode_token_tree_vp8(dec: &mut BoolDecoder, probs: &[u8; 11]) -> u32 {
 // ---------------------------------------------------------------------------
 
 /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4).
+///
+/// Matches the reference `vp8_short_idct4x4llm_c`: column pass first, row pass
+/// second (with the `+4` rounding bias). See the encoder-side `transform::idct4x4`
+/// for why the pass order is load-bearing.
 fn idct4x4(input: &[i32; 16]) -> [i32; 16] {
     let mut tmp = [0i32; 16];
 
-    // Row pass (no rounding bias -- bias is added in column pass).
+    // Column pass.
     for i in 0..4 {
-        let c0 = input[i * 4];
-        let c1 = input[i * 4 + 1];
-        let c2 = input[i * 4 + 2];
-        let c3 = input[i * 4 + 3];
+        let c0 = input[i];
+        let c1 = input[i + 4];
+        let c2 = input[i + 8];
+        let c3 = input[i + 12];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        tmp[i * 4] = a1 + temp2;
-        tmp[i * 4 + 3] = a1 - temp2;
-        tmp[i * 4 + 1] = b1 + temp1;
-        tmp[i * 4 + 2] = b1 - temp1;
+        tmp[i] = a1 + temp2;
+        tmp[i + 12] = a1 - temp2;
+        tmp[i + 4] = b1 + temp1;
+        tmp[i + 8] = b1 - temp1;
     }
 
-    // Column pass (with +4 rounding bias, then >>3 normalization).
+    // Row pass (with +4 rounding bias, then >>3 normalization).
     let mut result = [0i32; 16];
     for i in 0..4 {
-        let c0 = tmp[i];
-        let c1 = tmp[i + 4];
-        let c2 = tmp[i + 8];
-        let c3 = tmp[i + 12];
+        let c0 = tmp[i * 4];
+        let c1 = tmp[i * 4 + 1];
+        let c2 = tmp[i * 4 + 2];
+        let c3 = tmp[i * 4 + 3];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        result[i] = (a1 + temp2 + 4) >> 3;
-        result[i + 12] = (a1 - temp2 + 4) >> 3;
-        result[i + 4] = (b1 + temp1 + 4) >> 3;
-        result[i + 8] = (b1 - temp1 + 4) >> 3;
+        result[i * 4] = (a1 + temp2 + 4) >> 3;
+        result[i * 4 + 3] = (a1 - temp2 + 4) >> 3;
+        result[i * 4 + 1] = (b1 + temp1 + 4) >> 3;
+        result[i * 4 + 2] = (b1 - temp1 + 4) >> 3;
     }
     result
 }
diff --git a/agno/src/codec/webp/transform.rs b/agno/src/codec/webp/transform.rs
index 5b091b1..9ddd4ee 100644
--- a/agno/src/codec/webp/transform.rs
+++ b/agno/src/codec/webp/transform.rs
@@ -88,44 +88,50 @@ pub fn fwht4x4(input: &[i16; 16]) -> [i16; 16] {
 }
 
 /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4).
+///
+/// Matches the reference `vp8_short_idct4x4llm_c`: the first pass operates on
+/// columns, the second pass on rows (with the `+4` rounding bias). The pass
+/// order matters because the `>>16` truncation in the trig terms is not
+/// associative across passes — transposing the passes yields off-by-one
+/// reconstruction differences from the reference decoder (libwebp).
 pub fn idct4x4(input: &[i32; 16]) -> [i32; 16] {
     let mut tmp = [0i32; 16];
 
-    // Row pass.
+    // Column pass.
     for i in 0..4 {
-        let c0 = input[i * 4];
-        let c1 = input[i * 4 + 1];
-        let c2 = input[i * 4 + 2];
-        let c3 = input[i * 4 + 3];
+        let c0 = input[i];
+        let c1 = input[i + 4];
+        let c2 = input[i + 8];
+        let c3 = input[i + 12];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        tmp[i * 4] = a1 + temp2;
-        tmp[i * 4 + 3] = a1 - temp2;
-        tmp[i * 4 + 1] = b1 + temp1;
-        tmp[i * 4 + 2] = b1 - temp1;
+        tmp[i] = a1 + temp2;
+        tmp[i + 12] = a1 - temp2;
+        tmp[i + 4] = b1 + temp1;
+        tmp[i + 8] = b1 - temp1;
     }
 
-    // Column pass (with +4 rounding bias, then >>3 normalization).
+    // Row pass (with +4 rounding bias, then >>3 normalization).
     let mut result = [0i32; 16];
     for i in 0..4 {
-        let c0 = tmp[i];
-        let c1 = tmp[i + 4];
-        let c2 = tmp[i + 8];
-        let c3 = tmp[i + 12];
+        let c0 = tmp[i * 4];
+        let c1 = tmp[i * 4 + 1];
+        let c2 = tmp[i * 4 + 2];
+        let c3 = tmp[i * 4 + 3];
 
         let a1 = c0 + c2;
         let b1 = c0 - c2;
         let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16);
         let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16);
 
-        result[i] = (a1 + temp2 + 4) >> 3;
-        result[i + 12] = (a1 - temp2 + 4) >> 3;
-        result[i + 4] = (b1 + temp1 + 4) >> 3;
-        result[i + 8] = (b1 - temp1 + 4) >> 3;
+        result[i * 4] = (a1 + temp2 + 4) >> 3;
+        result[i * 4 + 3] = (a1 - temp2 + 4) >> 3;
+        result[i * 4 + 1] = (b1 + temp1 + 4) >> 3;
+        result[i * 4 + 2] = (b1 - temp1 + 4) >> 3;
     }
     result
 }
@@ -158,3 +164,26 @@ pub fn iwht4x4(input: &[i16; 16]) -> [i16; 16] {
     }
     result
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Pins `idct4x4` to the VP8 reference pass order (column pass first, then
+    /// row pass). The `>>16` trig truncations are not associative across passes,
+    /// so a row-first IDCT yields off-by-one results that a conformant decoder
+    /// (libwebp) does not share — which manifested as an accumulating chroma
+    /// cast in WebP output. This block is chosen so the two pass orders differ;
+    /// the expected values are the reference (column-first) output. A regression
+    /// to row-first changes indices 0, 3, 5, and 7.
+    #[test]
+    fn idct4x4_uses_vp8_reference_pass_order() {
+        let input = [
+            120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2,
+        ];
+        let expected = [
+            20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15,
+        ];
+        assert_eq!(idct4x4(&input), expected);
+    }
+}

From b1f5de3513d3f8aee65458b702c1baaf55f55076 Mon Sep 17 00:00:00 2001
From: ethanrous <ethanrousseau99+github@gmail.com>
Date: Sun, 7 Jun 2026 17:43:37 -0400
Subject: [PATCH 2/2] style: rustfmt the idct4x4 regression test arrays

---
 agno/src/codec/webp/transform.rs | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/agno/src/codec/webp/transform.rs b/agno/src/codec/webp/transform.rs
index 9ddd4ee..a555a9b 100644
--- a/agno/src/codec/webp/transform.rs
+++ b/agno/src/codec/webp/transform.rs
@@ -178,12 +178,8 @@ mod tests {
     /// to row-first changes indices 0, 3, 5, and 7.
     #[test]
     fn idct4x4_uses_vp8_reference_pass_order() {
-        let input = [
-            120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2,
-        ];
-        let expected = [
-            20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15,
-        ];
+        let input = [120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2];
+        let expected = [20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15];
         assert_eq!(idct4x4(&input), expected);
     }
 }