From 22d1611800dba873298ec48a1d89b2ccea99de08 Mon Sep 17 00:00:00 2001 From: ethanrous Date: Sun, 7 Jun 2026 14:45:23 -0400 Subject: [PATCH 1/2] fix: correct VP8 IDCT pass order in WebP codec agno's inverse 4x4 DCT ran the row pass before the column pass. The VP8 reference (vp8_short_idct4x4llm_c / libwebp TransformOne) runs the column pass first, then the row pass (with the +4 rounding bias). The >>16 trig-term truncations are not associative across passes, so transposing them yields off-by-one reconstruction differences from any conformant decoder. agno's encoder reconstruction and its own decoder shared the wrong order, so round-tripping through agno looked correct and hid the bug. When agno's WebP is decoded by libwebp (browsers, Weblens), chroma drifts: chroma DC flows through this IDCT (there is no Y2/WHT path for chroma), so the +/-1 errors are not zero-mean on structured content and accumulate via intra prediction into a visible chroma cast (~+2 U/V on a 1024px crop, ~+6 on full frames). Luma DC goes through the order-independent WHT, so luma was barely affected -- matching the observed chroma-only signature. Swap both idct4x4 implementations (encoder transform.rs, decoder decode.rs) to column-first / row-second, matching the reference. Add a regression test pinning the pass order; a round-trip test cannot catch this because the encoder and decoder shared the deviation. --- agno/src/codec/webp/decode.rs | 40 +++++++++++--------- agno/src/codec/webp/transform.rs | 65 +++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 36 deletions(-) diff --git a/agno/src/codec/webp/decode.rs b/agno/src/codec/webp/decode.rs index 578c3d6..93e64ab 100644 --- a/agno/src/codec/webp/decode.rs +++ b/agno/src/codec/webp/decode.rs @@ -685,44 +685,48 @@ fn decode_token_tree_vp8(dec: &mut BoolDecoder, probs: &[u8; 11]) -> u32 { // --------------------------------------------------------------------------- /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4). +/// +/// Matches the reference `vp8_short_idct4x4llm_c`: column pass first, row pass +/// second (with the `+4` rounding bias). See the encoder-side `transform::idct4x4` +/// for why the pass order is load-bearing. fn idct4x4(input: &[i32; 16]) -> [i32; 16] { let mut tmp = [0i32; 16]; - // Row pass (no rounding bias -- bias is added in column pass). + // Column pass. for i in 0..4 { - let c0 = input[i * 4]; - let c1 = input[i * 4 + 1]; - let c2 = input[i * 4 + 2]; - let c3 = input[i * 4 + 3]; + let c0 = input[i]; + let c1 = input[i + 4]; + let c2 = input[i + 8]; + let c3 = input[i + 12]; let a1 = c0 + c2; let b1 = c0 - c2; let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16); let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16); - tmp[i * 4] = a1 + temp2; - tmp[i * 4 + 3] = a1 - temp2; - tmp[i * 4 + 1] = b1 + temp1; - tmp[i * 4 + 2] = b1 - temp1; + tmp[i] = a1 + temp2; + tmp[i + 12] = a1 - temp2; + tmp[i + 4] = b1 + temp1; + tmp[i + 8] = b1 - temp1; } - // Column pass (with +4 rounding bias, then >>3 normalization). + // Row pass (with +4 rounding bias, then >>3 normalization). let mut result = [0i32; 16]; for i in 0..4 { - let c0 = tmp[i]; - let c1 = tmp[i + 4]; - let c2 = tmp[i + 8]; - let c3 = tmp[i + 12]; + let c0 = tmp[i * 4]; + let c1 = tmp[i * 4 + 1]; + let c2 = tmp[i * 4 + 2]; + let c3 = tmp[i * 4 + 3]; let a1 = c0 + c2; let b1 = c0 - c2; let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16); let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16); - result[i] = (a1 + temp2 + 4) >> 3; - result[i + 12] = (a1 - temp2 + 4) >> 3; - result[i + 4] = (b1 + temp1 + 4) >> 3; - result[i + 8] = (b1 - temp1 + 4) >> 3; + result[i * 4] = (a1 + temp2 + 4) >> 3; + result[i * 4 + 3] = (a1 - temp2 + 4) >> 3; + result[i * 4 + 1] = (b1 + temp1 + 4) >> 3; + result[i * 4 + 2] = (b1 - temp1 + 4) >> 3; } result } diff --git a/agno/src/codec/webp/transform.rs b/agno/src/codec/webp/transform.rs index 5b091b1..9ddd4ee 100644 --- a/agno/src/codec/webp/transform.rs +++ b/agno/src/codec/webp/transform.rs @@ -88,44 +88,50 @@ pub fn fwht4x4(input: &[i16; 16]) -> [i16; 16] { } /// Inverse 4x4 DCT (VP8 specification, Sections 14.3-14.4). +/// +/// Matches the reference `vp8_short_idct4x4llm_c`: the first pass operates on +/// columns, the second pass on rows (with the `+4` rounding bias). The pass +/// order matters because the `>>16` truncation in the trig terms is not +/// associative across passes — transposing the passes yields off-by-one +/// reconstruction differences from the reference decoder (libwebp). pub fn idct4x4(input: &[i32; 16]) -> [i32; 16] { let mut tmp = [0i32; 16]; - // Row pass. + // Column pass. for i in 0..4 { - let c0 = input[i * 4]; - let c1 = input[i * 4 + 1]; - let c2 = input[i * 4 + 2]; - let c3 = input[i * 4 + 3]; + let c0 = input[i]; + let c1 = input[i + 4]; + let c2 = input[i + 8]; + let c3 = input[i + 12]; let a1 = c0 + c2; let b1 = c0 - c2; let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16); let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16); - tmp[i * 4] = a1 + temp2; - tmp[i * 4 + 3] = a1 - temp2; - tmp[i * 4 + 1] = b1 + temp1; - tmp[i * 4 + 2] = b1 - temp1; + tmp[i] = a1 + temp2; + tmp[i + 12] = a1 - temp2; + tmp[i + 4] = b1 + temp1; + tmp[i + 8] = b1 - temp1; } - // Column pass (with +4 rounding bias, then >>3 normalization). + // Row pass (with +4 rounding bias, then >>3 normalization). let mut result = [0i32; 16]; for i in 0..4 { - let c0 = tmp[i]; - let c1 = tmp[i + 4]; - let c2 = tmp[i + 8]; - let c3 = tmp[i + 12]; + let c0 = tmp[i * 4]; + let c1 = tmp[i * 4 + 1]; + let c2 = tmp[i * 4 + 2]; + let c3 = tmp[i * 4 + 3]; let a1 = c0 + c2; let b1 = c0 - c2; let temp1 = ((c1 * 35468) >> 16) - c3 - ((c3 * 20091) >> 16); let temp2 = c1 + ((c1 * 20091) >> 16) + ((c3 * 35468) >> 16); - result[i] = (a1 + temp2 + 4) >> 3; - result[i + 12] = (a1 - temp2 + 4) >> 3; - result[i + 4] = (b1 + temp1 + 4) >> 3; - result[i + 8] = (b1 - temp1 + 4) >> 3; + result[i * 4] = (a1 + temp2 + 4) >> 3; + result[i * 4 + 3] = (a1 - temp2 + 4) >> 3; + result[i * 4 + 1] = (b1 + temp1 + 4) >> 3; + result[i * 4 + 2] = (b1 - temp1 + 4) >> 3; } result } @@ -158,3 +164,26 @@ pub fn iwht4x4(input: &[i16; 16]) -> [i16; 16] { } result } + +#[cfg(test)] +mod tests { + use super::*; + + /// Pins `idct4x4` to the VP8 reference pass order (column pass first, then + /// row pass). The `>>16` trig truncations are not associative across passes, + /// so a row-first IDCT yields off-by-one results that a conformant decoder + /// (libwebp) does not share — which manifested as an accumulating chroma + /// cast in WebP output. This block is chosen so the two pass orders differ; + /// the expected values are the reference (column-first) output. A regression + /// to row-first changes indices 0, 3, 5, and 7. + #[test] + fn idct4x4_uses_vp8_reference_pass_order() { + let input = [ + 120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2, + ]; + let expected = [ + 20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15, + ]; + assert_eq!(idct4x4(&input), expected); + } +} From b1f5de3513d3f8aee65458b702c1baaf55f55076 Mon Sep 17 00:00:00 2001 From: ethanrous Date: Sun, 7 Jun 2026 17:43:37 -0400 Subject: [PATCH 2/2] style: rustfmt the idct4x4 regression test arrays --- agno/src/codec/webp/transform.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/agno/src/codec/webp/transform.rs b/agno/src/codec/webp/transform.rs index 9ddd4ee..a555a9b 100644 --- a/agno/src/codec/webp/transform.rs +++ b/agno/src/codec/webp/transform.rs @@ -178,12 +178,8 @@ mod tests { /// to row-first changes indices 0, 3, 5, and 7. #[test] fn idct4x4_uses_vp8_reference_pass_order() { - let input = [ - 120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2, - ]; - let expected = [ - 20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15, - ]; + let input = [120, 35, -18, 7, -22, 14, 9, -5, 11, -8, 4, 3, -6, 2, -1, 2]; + let expected = [20, 15, 10, 4, 19, 19, 12, 4, 17, 19, 17, 4, 20, 22, 26, 15]; assert_eq!(idct4x4(&input), expected); } }