diff --git a/.seed/verifier/1990-04apr0106-page14.bundle.json b/.seed/verifier/1990-04apr0106-page14.bundle.json index 6e5edf6..d3efc19 100644 --- a/.seed/verifier/1990-04apr0106-page14.bundle.json +++ b/.seed/verifier/1990-04apr0106-page14.bundle.json @@ -32,9 +32,9 @@ "oddities": [], "row_bbox": [ 0, - 576, + 651, 1266, - 651 + 726 ] }, { @@ -46,9 +46,9 @@ "oddities": [], "row_bbox": [ 0, - 651, + 726, 1266, - 726 + 801 ] }, { @@ -62,9 +62,9 @@ ], "row_bbox": [ 0, - 726, + 801, 1266, - 801 + 876 ] }, { @@ -76,9 +76,9 @@ "oddities": [], "row_bbox": [ 0, - 801, + 876, 1266, - 876 + 951 ] }, { @@ -90,9 +90,9 @@ "oddities": [], "row_bbox": [ 0, - 876, + 951, 1266, - 951 + 1026 ] }, { @@ -106,9 +106,9 @@ ], "row_bbox": [ 0, - 951, + 1026, 1266, - 1026 + 1101 ] }, { @@ -122,9 +122,9 @@ ], "row_bbox": [ 0, - 1026, + 1101, 1266, - 1101 + 1176 ] }, { @@ -136,9 +136,9 @@ "oddities": [], "row_bbox": [ 0, - 1101, + 1176, 1266, - 1176 + 1328 ] }, { @@ -150,9 +150,9 @@ "oddities": [], "row_bbox": [ 0, - 1176, + 1328, 1266, - 1251 + 1403 ] }, { @@ -164,9 +164,9 @@ "oddities": [], "row_bbox": [ 0, - 1251, + 1403, 1266, - 1326 + 1478 ] }, { @@ -178,9 +178,9 @@ "oddities": [], "row_bbox": [ 0, - 1326, + 1478, 1266, - 1401 + 1553 ] }, { @@ -194,9 +194,9 @@ ], "row_bbox": [ 0, - 1401, + 1553, 1266, - 1551 + 1779 ] }, { @@ -208,9 +208,9 @@ "oddities": [], "row_bbox": [ 0, - 1551, + 1779, 1266, - 1626 + 1854 ] }, { @@ -222,9 +222,9 @@ "oddities": [], "row_bbox": [ 0, - 1626, + 1854, 1266, - 1701 + 1932 ] }, { @@ -236,9 +236,9 @@ "oddities": [], "row_bbox": [ 0, - 1701, + 1932, 1266, - 1776 + 2007 ] }, { @@ -250,9 +250,9 @@ "oddities": [], "row_bbox": [ 0, - 1776, + 2007, 1266, - 1851 + 2082 ] }, { @@ -264,9 +264,9 @@ "oddities": [], "row_bbox": [ 0, - 1851, + 2082, 1266, - 1926 + 2158 ] }, { @@ -278,9 +278,9 @@ "oddities": [], "row_bbox": [ 0, - 1926, + 2158, 1266, - 2001 + 2256 ] } ], diff --git a/.seed/verifier/1990-04apr0106-page29.bundle.json b/.seed/verifier/1990-04apr0106-page29.bundle.json index 0f164e0..e8bf60c 100644 --- a/.seed/verifier/1990-04apr0106-page29.bundle.json +++ b/.seed/verifier/1990-04apr0106-page29.bundle.json @@ -329,7 +329,7 @@ 1270, 549, 2550, - 624 + 875 ] }, { @@ -341,9 +341,9 @@ "oddities": [], "row_bbox": [ 1270, - 624, + 875, 2550, - 774 + 1026 ] }, { @@ -355,9 +355,9 @@ "oddities": [], "row_bbox": [ 1270, - 774, + 1026, 2550, - 849 + 1101 ] }, { @@ -369,9 +369,9 @@ "oddities": [], "row_bbox": [ 1270, - 849, + 1101, 2550, - 924 + 1176 ] }, { @@ -383,9 +383,9 @@ "oddities": [], "row_bbox": [ 1270, - 924, + 1176, 2550, - 999 + 1253 ] }, { @@ -397,9 +397,9 @@ "oddities": [], "row_bbox": [ 1270, - 999, + 1253, 2550, - 1074 + 1328 ] }, { @@ -411,9 +411,9 @@ "oddities": [], "row_bbox": [ 1270, - 1074, + 1328, 2550, - 1149 + 1403 ] }, { @@ -427,9 +427,9 @@ ], "row_bbox": [ 1270, - 1149, + 1403, 2550, - 1224 + 1478 ] }, { @@ -441,9 +441,9 @@ "oddities": [], "row_bbox": [ 1270, - 1224, + 1478, 2550, - 1299 + 1554 ] }, { @@ -455,9 +455,9 @@ "oddities": [], "row_bbox": [ 1270, - 1299, + 1554, 2550, - 1374 + 1628 ] }, { @@ -469,9 +469,9 @@ "oddities": [], "row_bbox": [ 1270, - 1374, + 1628, 2550, - 1449 + 1704 ] }, { @@ -483,9 +483,9 @@ "oddities": [], "row_bbox": [ 1270, - 1449, + 1704, 2550, - 1524 + 1779 ] }, { @@ -497,9 +497,9 @@ "oddities": [], "row_bbox": [ 1270, - 1524, + 1779, 2550, - 1599 + 1853 ] }, { @@ -511,9 +511,9 @@ "oddities": [], "row_bbox": [ 1270, - 1599, + 1853, 2550, - 1674 + 1928 ] }, { @@ -525,9 +525,9 @@ "oddities": [], "row_bbox": [ 1270, - 1674, + 1928, 2550, - 1749 + 2003 ] }, { @@ -541,9 +541,9 @@ ], "row_bbox": [ 1270, - 1749, + 2003, 2550, - 1824 + 2079 ] }, { @@ -555,9 +555,9 @@ "oddities": [], "row_bbox": [ 1270, - 1824, + 2079, 2550, - 1899 + 2152 ] }, { @@ -569,9 +569,9 @@ "oddities": [], "row_bbox": [ 1270, - 1899, + 2152, 2550, - 1974 + 2253 ] } ], diff --git a/.seed/verifier/1990-04apr0106-page34.bundle.json b/.seed/verifier/1990-04apr0106-page34.bundle.json index 201308c..8117ed6 100644 --- a/.seed/verifier/1990-04apr0106-page34.bundle.json +++ b/.seed/verifier/1990-04apr0106-page34.bundle.json @@ -32,7 +32,7 @@ 0, 542, 1282, - 617 + 643 ] }, { @@ -44,9 +44,9 @@ "oddities": [], "row_bbox": [ 0, - 617, + 643, 1282, - 692 + 718 ] }, { @@ -58,9 +58,9 @@ "oddities": [], "row_bbox": [ 0, - 692, + 718, 1282, - 767 + 794 ] }, { @@ -72,9 +72,9 @@ "oddities": [], "row_bbox": [ 0, - 767, + 794, 1282, - 842 + 868 ] }, { @@ -86,9 +86,9 @@ "oddities": [], "row_bbox": [ 0, - 842, + 868, 1282, - 917 + 944 ] }, { @@ -100,9 +100,9 @@ "oddities": [], "row_bbox": [ 0, - 917, + 944, 1282, - 992 + 1019 ] }, { @@ -116,9 +116,9 @@ ], "row_bbox": [ 0, - 992, + 1019, 1282, - 1067 + 1094 ] }, { @@ -130,9 +130,9 @@ "oddities": [], "row_bbox": [ 0, - 1067, + 1094, 1282, - 1142 + 1169 ] }, { @@ -144,9 +144,9 @@ "oddities": [], "row_bbox": [ 0, - 1142, + 1169, 1282, - 1217 + 1245 ] }, { @@ -158,9 +158,9 @@ "oddities": [], "row_bbox": [ 0, - 1217, + 1245, 1282, - 1367 + 1395 ] }, { @@ -172,9 +172,9 @@ "oddities": [], "row_bbox": [ 0, - 1367, + 1395, 1282, - 1442 + 1471 ] }, { @@ -186,9 +186,9 @@ "oddities": [], "row_bbox": [ 0, - 1442, + 1471, 1282, - 1592 + 1621 ] }, { @@ -200,9 +200,9 @@ "oddities": [], "row_bbox": [ 0, - 1592, + 1621, 1282, - 1667 + 1696 ] }, { @@ -214,9 +214,9 @@ "oddities": [], "row_bbox": [ 0, - 1667, + 1696, 1282, - 1742 + 1772 ] }, { @@ -228,9 +228,9 @@ "oddities": [], "row_bbox": [ 0, - 1742, + 1772, 1282, - 1817 + 1846 ] }, { @@ -242,9 +242,9 @@ "oddities": [], "row_bbox": [ 0, - 1817, + 1846, 1282, - 1892 + 1921 ] }, { @@ -256,9 +256,9 @@ "oddities": [], "row_bbox": [ 0, - 1892, + 1921, 1282, - 1967 + 1996 ] }, { @@ -272,9 +272,9 @@ ], "row_bbox": [ 0, - 1967, + 1996, 1282, - 2042 + 2072 ] }, { @@ -286,9 +286,9 @@ "oddities": [], "row_bbox": [ 0, - 2042, + 2072, 1282, - 2117 + 2146 ] }, { @@ -300,9 +300,9 @@ "oddities": [], "row_bbox": [ 0, - 2117, + 2146, 1282, - 2192 + 2247 ] } ], diff --git a/.seed/verifier/1990-04apr0712-page14.bundle.json b/.seed/verifier/1990-04apr0712-page14.bundle.json index 2391567..7d47e72 100644 --- a/.seed/verifier/1990-04apr0712-page14.bundle.json +++ b/.seed/verifier/1990-04apr0712-page14.bundle.json @@ -32,7 +32,7 @@ "oddities": [], "row_bbox": [ 0, - 584, + 583, 1277, 659 ] @@ -48,7 +48,7 @@ 0, 659, 1277, - 734 + 735 ] }, { @@ -60,9 +60,9 @@ "oddities": [], "row_bbox": [ 0, - 734, + 735, 1277, - 809 + 811 ] }, { @@ -74,9 +74,9 @@ "oddities": [], "row_bbox": [ 0, - 809, + 811, 1277, - 884 + 887 ] }, { @@ -90,9 +90,9 @@ ], "row_bbox": [ 0, - 884, + 887, 1277, - 1034 + 1039 ] }, { @@ -106,9 +106,9 @@ ], "row_bbox": [ 0, - 1034, + 1039, 1277, - 1109 + 1115 ] }, { @@ -120,9 +120,9 @@ "oddities": [], "row_bbox": [ 0, - 1109, + 1115, 1277, - 1184 + 1191 ] }, { @@ -134,9 +134,9 @@ "oddities": [], "row_bbox": [ 0, - 1184, + 1191, 1277, - 1259 + 1267 ] }, { @@ -148,9 +148,9 @@ "oddities": [], "row_bbox": [ 0, - 1259, + 1267, 1277, - 1334 + 1343 ] }, { @@ -162,9 +162,9 @@ "oddities": [], "row_bbox": [ 0, - 1334, + 1343, 1277, - 1409 + 1419 ] }, { @@ -176,9 +176,9 @@ "oddities": [], "row_bbox": [ 0, - 1409, + 1419, 1277, - 1484 + 1495 ] }, { @@ -190,9 +190,9 @@ "oddities": [], "row_bbox": [ 0, - 1484, + 1495, 1277, - 1559 + 1571 ] }, { @@ -206,9 +206,9 @@ ], "row_bbox": [ 0, - 1559, + 1571, 1277, - 1634 + 1647 ] }, { @@ -222,9 +222,9 @@ ], "row_bbox": [ 0, - 1634, + 1647, 1277, - 1709 + 1723 ] }, { @@ -236,9 +236,9 @@ "oddities": [], "row_bbox": [ 0, - 1709, + 1723, 1277, - 1784 + 1799 ] }, { @@ -252,9 +252,9 @@ ], "row_bbox": [ 0, - 1784, + 1799, 1277, - 1934 + 1951 ] } ], diff --git a/.seed/verifier/1990-04apr0712-page24.bundle.json b/.seed/verifier/1990-04apr0712-page24.bundle.json index a341beb..f3284ea 100644 --- a/.seed/verifier/1990-04apr0712-page24.bundle.json +++ b/.seed/verifier/1990-04apr0712-page24.bundle.json @@ -190,9 +190,9 @@ "oddities": [], "row_bbox": [ 1282, - 578, + 654, 2550, - 654 + 804 ] }, { @@ -204,9 +204,9 @@ "oddities": [], "row_bbox": [ 1282, - 654, + 804, 2550, - 730 + 954 ] }, { @@ -218,9 +218,9 @@ "oddities": [], "row_bbox": [ 1282, - 730, + 954, 2550, - 806 + 1104 ] }, { @@ -232,9 +232,9 @@ "oddities": [], "row_bbox": [ 1282, - 806, + 1104, 2550, - 882 + 1254 ] }, { @@ -246,9 +246,9 @@ "oddities": [], "row_bbox": [ 1282, - 882, + 1254, 2550, - 958 + 1404 ] }, { @@ -260,9 +260,9 @@ "oddities": [], "row_bbox": [ 1282, - 958, + 1404, 2550, - 1034 + 1554 ] }, { @@ -274,9 +274,9 @@ "oddities": [], "row_bbox": [ 1282, - 1034, + 1554, 2550, - 1110 + 1704 ] }, { @@ -288,9 +288,9 @@ "oddities": [], "row_bbox": [ 1282, - 1110, + 1704, 2550, - 1186 + 1854 ] }, { @@ -302,9 +302,9 @@ "oddities": [], "row_bbox": [ 1282, - 1186, + 1854, 2550, - 1262 + 2004 ] }, { @@ -316,9 +316,9 @@ "oddities": [], "row_bbox": [ 1282, - 1262, + 2004, 2550, - 1338 + 2154 ] }, { @@ -330,9 +330,9 @@ "oddities": [], "row_bbox": [ 1282, - 1338, + 2154, 2550, - 1490 + 2302 ] }, { @@ -344,9 +344,9 @@ "oddities": [], "row_bbox": [ 1282, - 1490, + 2302, 2550, - 1566 + 2302 ] }, { @@ -358,9 +358,9 @@ "oddities": [], "row_bbox": [ 1282, - 1566, + 2302, 2550, - 1642 + 2302 ] } ], diff --git a/.seed/verifier/1990-04apr2430-page23.bundle.json b/.seed/verifier/1990-04apr2430-page23.bundle.json index 6c90ad9..e643bea 100644 --- a/.seed/verifier/1990-04apr2430-page23.bundle.json +++ b/.seed/verifier/1990-04apr2430-page23.bundle.json @@ -34,7 +34,7 @@ 0, 546, 1264, - 622 + 648 ] }, { @@ -46,9 +46,9 @@ "oddities": [], "row_bbox": [ 0, - 622, + 648, 1264, - 774 + 798 ] }, { @@ -60,9 +60,9 @@ "oddities": [], "row_bbox": [ 0, - 774, + 798, 1264, - 850 + 872 ] }, { @@ -74,9 +74,9 @@ "oddities": [], "row_bbox": [ 0, - 850, + 872, 1264, - 926 + 948 ] }, { @@ -88,9 +88,9 @@ "oddities": [], "row_bbox": [ 0, - 926, + 948, 1264, - 1078 + 1098 ] }, { @@ -102,9 +102,9 @@ "oddities": [], "row_bbox": [ 0, - 1078, + 1098, 1264, - 1154 + 1174 ] }, { @@ -116,9 +116,9 @@ "oddities": [], "row_bbox": [ 0, - 1154, + 1174, 1264, - 1306 + 1326 ] }, { @@ -132,9 +132,9 @@ ], "row_bbox": [ 0, - 1306, + 1326, 1264, - 1382 + 1400 ] }, { @@ -148,9 +148,9 @@ ], "row_bbox": [ 0, - 1382, + 1400, 1264, - 1458 + 1476 ] }, { @@ -162,9 +162,9 @@ "oddities": [], "row_bbox": [ 0, - 1458, + 1476, 1264, - 1534 + 1551 ] }, { @@ -176,9 +176,9 @@ "oddities": [], "row_bbox": [ 0, - 1534, + 1551, 1264, - 1610 + 1626 ] }, { @@ -190,9 +190,9 @@ "oddities": [], "row_bbox": [ 0, - 1610, + 1626, 1264, - 1686 + 1702 ] }, { @@ -206,9 +206,9 @@ ], "row_bbox": [ 0, - 1686, + 1702, 1264, - 1838 + 2001 ] }, { @@ -220,9 +220,9 @@ "oddities": [], "row_bbox": [ 0, - 1838, + 2001, 1264, - 1990 + 2254 ] } ], diff --git a/.seed/verifier/1990-04apr2430-page29.bundle.json b/.seed/verifier/1990-04apr2430-page29.bundle.json index 16ac4b7..44e5c89 100644 --- a/.seed/verifier/1990-04apr2430-page29.bundle.json +++ b/.seed/verifier/1990-04apr2430-page29.bundle.json @@ -34,7 +34,7 @@ 0, 548, 1281, - 624 + 651 ] }, { @@ -46,9 +46,9 @@ "oddities": [], "row_bbox": [ 0, - 624, + 651, 1281, - 700 + 754 ] }, { @@ -60,9 +60,9 @@ "oddities": [], "row_bbox": [ 0, - 700, + 754, 1281, - 776 + 857 ] }, { @@ -74,9 +74,9 @@ "oddities": [], "row_bbox": [ 0, - 776, + 857, 1281, - 852 + 960 ] }, { @@ -90,9 +90,9 @@ ], "row_bbox": [ 0, - 852, + 960, 1281, - 928 + 1063 ] }, { @@ -104,9 +104,9 @@ "oddities": [], "row_bbox": [ 0, - 928, + 1063, 1281, - 1004 + 1166 ] }, { @@ -118,9 +118,9 @@ "oddities": [], "row_bbox": [ 0, - 1004, + 1166, 1281, - 1080 + 1269 ] }, { @@ -132,9 +132,9 @@ "oddities": [], "row_bbox": [ 0, - 1080, + 1269, 1281, - 1156 + 1372 ] }, { @@ -146,9 +146,9 @@ "oddities": [], "row_bbox": [ 0, - 1156, + 1372, 1281, - 1232 + 1475 ] }, { @@ -160,9 +160,9 @@ "oddities": [], "row_bbox": [ 0, - 1232, + 1475, 1281, - 1308 + 1578 ] }, { @@ -174,9 +174,9 @@ "oddities": [], "row_bbox": [ 0, - 1308, + 1578, 1281, - 1384 + 1681 ] }, { @@ -188,9 +188,9 @@ "oddities": [], "row_bbox": [ 0, - 1384, + 1681, 1281, - 1460 + 1784 ] }, { @@ -202,9 +202,9 @@ "oddities": [], "row_bbox": [ 0, - 1460, + 1784, 1281, - 1536 + 1887 ] }, { @@ -216,9 +216,9 @@ "oddities": [], "row_bbox": [ 0, - 1536, + 1887, 1281, - 1612 + 1990 ] }, { @@ -230,9 +230,9 @@ "oddities": [], "row_bbox": [ 0, - 1612, + 1990, 1281, - 1688 + 2093 ] }, { @@ -244,9 +244,9 @@ "oddities": [], "row_bbox": [ 0, - 1688, + 2093, 1281, - 1764 + 2196 ] }, { @@ -258,9 +258,9 @@ "oddities": [], "row_bbox": [ 0, - 1764, + 2196, 1281, - 1840 + 2299 ] }, { @@ -272,9 +272,9 @@ "oddities": [], "row_bbox": [ 0, - 1840, + 2299, 1281, - 1916 + 2310 ] }, { @@ -286,9 +286,9 @@ "oddities": [], "row_bbox": [ 0, - 1916, + 2310, 1281, - 1992 + 2310 ] }, { @@ -300,9 +300,9 @@ "oddities": [], "row_bbox": [ 0, - 1992, + 2310, 1281, - 2068 + 2310 ] }, { @@ -314,9 +314,9 @@ "oddities": [], "row_bbox": [ 0, - 2068, + 2310, 1281, - 2144 + 2310 ] } ], @@ -344,9 +344,9 @@ "oddities": [], "row_bbox": [ 1281, - 2159, + 2057, 2550, - 2172 + 2159 ] }, { @@ -358,9 +358,9 @@ "oddities": [], "row_bbox": [ 1281, - 2172, + 2159, 2550, - 2185 + 2261 ] }, { @@ -372,9 +372,9 @@ "oddities": [], "row_bbox": [ 1281, - 2185, + 2261, 2550, - 2198 + 2310 ] }, { @@ -386,9 +386,9 @@ "oddities": [], "row_bbox": [ 1281, - 2198, + 2310, 2550, - 2211 + 2310 ] }, { @@ -400,9 +400,9 @@ "oddities": [], "row_bbox": [ 1281, - 2211, + 2310, 2550, - 2224 + 2310 ] }, { @@ -414,9 +414,9 @@ "oddities": [], "row_bbox": [ 1281, - 2224, + 2310, 2550, - 2237 + 2310 ] }, { @@ -430,9 +430,9 @@ ], "row_bbox": [ 1281, - 2237, + 2310, 2550, - 2250 + 2310 ] }, { @@ -444,9 +444,9 @@ "oddities": [], "row_bbox": [ 1281, - 2250, + 2310, 2550, - 2263 + 2310 ] }, { @@ -458,9 +458,9 @@ "oddities": [], "row_bbox": [ 1281, - 2263, + 2310, 2550, - 2276 + 2310 ] }, { @@ -472,9 +472,9 @@ "oddities": [], "row_bbox": [ 1281, - 2276, + 2310, 2550, - 2289 + 2310 ] }, { @@ -486,9 +486,9 @@ "oddities": [], "row_bbox": [ 1281, - 2289, + 2310, 2550, - 2302 + 2310 ] } ], diff --git a/core/page_layout.py b/core/page_layout.py index 7ea4cff..b67d5fe 100644 --- a/core/page_layout.py +++ b/core/page_layout.py @@ -361,14 +361,21 @@ def partition_row_lines_by_quadrant( # Correction pass: on some pages `_detect_body_mid_y` lands BELOW the # bottom-block hour-jock-cell baseline (the anchor at 0.55h prefers the # gap below the cell over the true inter-block gap above it). The - # baseline line then gets misattributed to the top quadrant, and the + # baseline line then gets attributed only to the top quadrant, and the # bottom quadrant's first detected line is row 0's BOTTOM rather than - # its top — shifting every row crop up by one. + # its top — shifting every bottom-quadrant row crop up by one. # # Signal: the top quadrant's last spacing is significantly larger than # the median row spacing across all detected lines (a normal sequence # has consistent spacing; an anomalous jump at the end means the last - # line belongs to a different sequence — the bottom block). + # line is the printed grid line at the inter-block boundary). + # + # Resolution: that boundary line bounds BOTH bands and must appear in + # both partitions. Add it to the bottom quadrant WITHOUT removing it + # from the top: the top-quadrant cropper needs it as row N-1's bottom + # endpoint (without it, clean-pairing falls back to median-gap stepping + # anchored to lines[0], which on pages with a tall first row drifts + # below the true grid by half a row per step — Alex's page-34 report). if len(all_lines) >= 2: median_spacing = float(np.median(np.diff(np.asarray(all_lines)))) if median_spacing > 0: @@ -380,6 +387,6 @@ def partition_row_lines_by_quadrant( if len(top_lines) >= 2: last_spacing = top_lines[-1] - top_lines[-2] if last_spacing > _BOTTOM_BASELINE_REATTRIBUTION_RATIO * median_spacing: - moved = top_lines.pop() - out[bottom_pos].insert(0, moved) # type: ignore[index] + shared = top_lines[-1] + out[bottom_pos].insert(0, shared) # type: ignore[index] return out diff --git a/tests/unit/test_page_layout.py b/tests/unit/test_page_layout.py index 126d05d..c451734 100644 --- a/tests/unit/test_page_layout.py +++ b/tests/unit/test_page_layout.py @@ -299,32 +299,50 @@ def test_partition_row_lines_finds_content_in_top_band( assert total_top > 0, f"{stem}: no row lines detected in top band" -def test_partition_row_lines_reattributes_misclassified_bottom_baseline() -> None: - """When the top quadrant's last spacing is anomalously large (the line - is actually the hour-jock baseline of the bottom block, misattributed - because body_mid_y landed below it), it gets moved to the corresponding - bottom quadrant. - - Pages 20 and 25 of the 1990-04 golden set exhibit this: top_left's last - spacing is 100px vs median 75. The fix moves y≈2251 (page25) from - top_left to bottom_left. +def test_partition_row_lines_shares_boundary_line_with_bottom_block() -> None: + """When the top quadrant's last spacing is anomalously large, the trailing + line is the printed grid line that sits between row N of the top block + and the hour-jock cell of the bottom block. That same printed line + bounds BOTH bands and must appear in both partitions: + + - It is bottom_left's first line (where row 0's bbox needs it as its + top edge — without it, the bottom-quadrant crop slides up by one row). + - It also remains top_left's LAST line (where row N-1's bbox needs it + as its bottom edge — without it, every top-quadrant crop slides up + by half a row because clean-pairing falls back to median-gap + stepping anchored to lines[0], which on tall first-row pages produces + a fixed-step grid that drifts below the true row boundaries). + + Pages 20 and 25 of the 1990-04 golden set show this geometry: top_left's + last spacing is 100px vs median 75. The shared-line invariant must hold + on both sides. """ stem = "1990-04apr0106-page25" image = Image.open(GOLDEN_DIR / f"{stem}.png") layout = detect_page_layout(image) partitions = partition_row_lines_by_quadrant(image, layout) - # bottom_left must start with a line ABOVE body_mid_y (the reattributed - # hour-jock baseline). The original first line below body_mid_y was 2352; - # after reattribution, ~2251 should now be the new first line. - assert partitions["bottom_left"][0] < layout.body_mid_y, ( - f"expected first bottom_left line to be reattributed above body_mid_y, " - f"got {partitions['bottom_left'][0]} vs body_mid_y={layout.body_mid_y}" + # bottom_left must start with a line ABOVE body_mid_y (the shared + # boundary line). The original first line below body_mid_y was 2352; + # the shared line ~2251 should now be the new first line. + bottom_first = partitions["bottom_left"][0] + assert bottom_first < layout.body_mid_y, ( + f"expected first bottom_left line to be the shared boundary line above " + f"body_mid_y, got {bottom_first} vs body_mid_y={layout.body_mid_y}" ) - # And the spacing from the new first line to the next should be ~one row, + # And the spacing from the shared line to the next should be ~one row, # accounting for the hour-jock cell baseline at the top. - diff = partitions["bottom_left"][1] - partitions["bottom_left"][0] + diff = partitions["bottom_left"][1] - bottom_first assert 90 < diff < 115, f"unexpected first-row span: {diff}" + # The same y MUST also remain as top_left's last line — otherwise the + # top-quadrant cropper loses an endpoint and falls back to a fixed-step + # grid that drifts below the true row boundaries (the page-34 + # misalignment Alex flagged in the bbox-pathology sweep). + assert partitions["top_left"][-1] == bottom_first, ( + f"expected top_left's last line to be the shared boundary line " + f"{bottom_first}, got {partitions['top_left'][-1]}" + ) + def test_partition_row_lines_handles_blank_image() -> None: """A blank image returns four empty lists — no crash, no missing keys."""