diff --git a/CHANGELOG.md b/CHANGELOG.md index 9eac282..c67ce7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/). - Fixed `ComponentList.add_component` to honor an explicit `cmpt_rank` of `0` (the previous falsy check replaced rank `0` with the auto-counter) - Added characterization tests pinning every main-layout routing and extraction branch; package-wide simplification pass (dead-code removal, shared-helper reuse, hoisted regex compilation) with no behavior change - Parse whole-page knowledge-panel (`kp-wholepage`) tabs as a sub-column: when the panel has collapsed the main result column into its active tab, the tab's heterogeneous blocks (organics, `top_stories`, `videos`, knowledge cards, and specialized widgets) are each routed through the normal classify/parse pipeline instead of being emitted as one mislabeled component. Anchors on the stable `kp-wp-tab-cont-*` id and flattens grouping wrappers, recovering organics that were silently dropped on election/entity panels whose blocks nest under a wrapper the per-tab recipes could not reach (an Arizona-primaries-style panel went from 0 organics to the full column). The `standard-overview`/`standard-airfares` recipes still claim the panels they natively and fully handle (output unchanged); the sub-column model takes over only when a recipe under-extracts. Supersedes the interim `div.g`-only recovery +- Fixed a regression on *complementary* `kp-wholepage` panels — a music/entity panel that sits beside an intact organic column instead of collapsing it (e.g. `this is my life billy joel youtube`): the sub-column gate mis-keyed on `div.tF2Cxc` and dropped the 10 video organics, and the panel was mistyped as `searches_related`/`or_give_general_feedback` from an inner "People also search for" heading. 
The collapse gate now recognizes any titled-link `div.g` (`h3` + `a[href]`) as organic, and both the header classifier and the knowledge sub-typer defer for `kp-wholepage-osrp` panels so they classify as `knowledge`/`panel`. The true collapsed case (`30 year mortgage rates`) is unaffected - Split bare-`tF2Cxc` organic bundles in `general` components into one result per organic (excluding People-Also-Ask sources), instead of collapsing them to a single result — recovers the organics on finance-style panels (e.g. an `aapl stock price` panel went from 1 to 9 organics) - Added `election_dates`, `election_results`, and `election_resources` component types for the election widgets embedded in whole-page election panels diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index ccc3198..2e73e55 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -61,6 +61,13 @@ class ClassifyMainHeader: @staticmethod def classify(cmpt, levels: tuple[int, ...] = (2, 3)) -> str: node: Node = cmpt + # A whole-page entity panel (``kp-wholepage-osrp``) can embed sub-carousels + # ("People also search for") and feedback affordances whose level-2 headings + # would mis-claim the entire panel (e.g. as ``searches_related``). Defer to the + # structural classifiers downstream (``available_on``, ``knowledge_panel``), + # which type these panels correctly. 
+ if node.css_first("div.kp-wholepage-osrp") is not None: + return "unknown" for level in levels: header = ClassifyMainHeader._classify_header(node, level) if header != "unknown": diff --git a/WebSearcher/component_parsers/knowledge.py b/WebSearcher/component_parsers/knowledge.py index 5a43416..6e0aa85 100644 --- a/WebSearcher/component_parsers/knowledge.py +++ b/WebSearcher/component_parsers/knowledge.py @@ -228,6 +228,13 @@ def _subtype_things_to_know(node: Node, parsed: dict, details: dict, h2_text: st def _subtype_dynamic_section(node: Node, parsed: dict, details: dict, h2_text: str) -> bool: if node.css_first("div.JNkvid") is None: return False + # A whole-page entity panel (kp-wholepage-osrp) is a generic "panel": an internal + # subcard (e.g. a 'People also search for' carousel, or a feedback affordance whose + # heading precedes it in document order) is a section of the panel, not the panel's + # defining sub_type. Defer to the ``_subtype_panel`` fallback so the whole component + # stays ``panel`` instead of inheriting a subcard/affordance heading. + if node.css_first("div.kp-wholepage-osrp") is not None: + return False section_heading = node.css_first('[role="heading"][aria-level="2"]') if section_heading is None: # JNkvid without a section heading falls through to the panel fallback. 
diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index b53d29a..0349b8d 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -278,12 +278,28 @@ def _kp_subcolumn(kp: Node) -> list[Node]: @staticmethod def _kp_organics_outside(rso_div: Node, kp: Node) -> bool: - """True if any ``div.g`` organic (``div.tF2Cxc`` inside) sits in ``#rso`` - but outside the panel -- the tell that the panel is a *complementary* side - panel, not the collapsed main column.""" + """True if any organic ``div.g`` sits in ``#rso`` but outside the panel -- + the tell that the panel is a *complementary* side panel, not the collapsed + main column. + + An organic is any ``div.g`` carrying a titled link (an ``h3`` and an + ``a[href]``), not just the classic ``div.tF2Cxc`` text result: a music/video + panel renders its organic video results as ``div.g`` siblings of the panel + (a YouTube ``div.g`` has the title+link but no inner ``div.tF2Cxc``). Keying + the guard on ``div.tF2Cxc`` alone missed those, so the gate misfired and the + sub-column path discarded the video organics. ``h3`` + ``a[href]`` matches + both shapes while still excluding chrome ``div.g`` wrappers. + + Assumption: a titled-link ``div.g`` is an *organic*, i.e. the tell of a + complementary panel. A genuinely collapsed panel that nonetheless renders + some titled-link ``div.g`` (a PAA block, a video-carousel item, an image + card with a heading link) *outside* the ``kp`` in ``#rso`` would wrongly + read as complementary here and skip the sub-column collapse. 
No such shape + is known (the collapsed "30 year mortgage rates" fixture keeps its organics + inside the panel); revisit this test if one surfaces.""" kp_id = kp.mem_id for g in subtree_css(rso_div, "div.g"): - if g.css_first("div.tF2Cxc") is None: + if g.css_first("h3") is None or g.css_first("a[href]") is None: continue node, inside = g, False while node is not None and node.mem_id != rso_div.mem_id: diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3917a73cbceb].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3917a73cbceb].json new file mode 100644 index 0000000..a3ba232 --- /dev/null +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3917a73cbceb].json @@ -0,0 +1,268 @@ +{ + "features": { + "captcha": false, + "infinity_scroll": true, + "language": "en", + "main_layout": "standard", + "notice_no_results": false, + "notice_server_error": false, + "notice_shortened_query": false, + "overlay_precise_location": false, + "result_estimate_count": 8540000.0, + "result_estimate_time": 0.48 + }, + "results": [ + { + "cite": "22.9M+ views · 14 years ago", + "cmpt_rank": 0, + "details": { + "duration": "5:41", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 0, + "sub_rank": 0, + "sub_type": "video", + "text": "Official HD musicvideofor \"My Life\" byBilly JoelListen toBilly Joel: https://billyjoel.lnk.to/listenYD Subscribe to theBilly Joel...", + "title": "Billy Joel - My Life (Official Video) - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=h3JFEfdK_Ls" + }, + { + "cite": null, + "cmpt_rank": 1, + "details": { + "heading": "Choose what you’re giving feedback on", + "img_url": null, + "text": "Choose what you’re giving feedback on|general feedback|My Life|Song by Billy Joel|Song by Billy Joel|Lyrics|Videos|Listen|iHeartRadio|iHeartRadio|Deezer|Deezer|People also search for|People also search for|People also search for|The River of Dreams|Billy Joel|It's Still Rock and 
Roll to Me|Billy Joel|She's Always a Woman|Billy Joel|Pressure|Billy Joel|See all", + "type": "panel", + "urls": [ + { + "text": "iHeartRadio", + "url": "https://www.iheart.com/artist/billy-joel-4684/songs/my-life-163680770/?autoplay=true" + }, + { + "text": "Deezer", + "url": "https://www.deezer.com/track/15179460" + }, + { + "text": "The River of Dreams|Billy Joel", + "url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rO4P19IF_yO85wYpkEaz-W_oZWd5JUOOVnUVftf2aeofFP9VwCCA6Uqaikfh-92GALnRq-UPURedMcT0lWCLe3ziKnjpQjdAKY0L7ZI9W4TxjJkH0-Ws0s0Q5wRWqpr-h7J_NSMc9pu69Nt0K4bXsD10Tdag6PuLT59EQs4tTA9haVfss7bCN05GWy2CSjcl3FML8PlA6gAHBDicp4l858JXuMRoZgDoVrL9uCGM5ATRyU_n4%3D&q=billy+joel+the+river+of+dreams&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAB6BAgdEAI" + }, + { + "text": "It's Still Rock and Roll to Me|Billy Joel", + "url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rO4P19IF_yO85wYpkEaz-W_oZWd5JUOOVnUVftf2aeoWz1Wedhm8th9fOZsRFuLTXs_Ta_IoH8SH7My22Pq8cbOYumb7DW2V2K1CnsuoifO2ihWhwl36OZcogZGvimjEtNh_aDWGNphEh8EdSJ_KTHETs6dbWLlhnOWbtflpbjckCncG1qT2XTRz3Jdg-4n-rgorCGenl8umHipXHK6EaNlAQKz_YkwNF4QcWi6DuGQwpT960Psdwv1vdhBTo8pif2SFSu&q=billy+joel+it%27s+still+rock+and+roll+to+me&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAF6BAgdEAM" + }, + { + "text": "She's Always a Woman|Billy Joel", + "url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rNw0aV2xaMSajixiwS9lmNu5OlNFGzE_NenPRGAlzGM7dChKzB13VzQg5KJIStO84yXmiuqILspKMTxPcDkqrNCH2nE_Fpk4yq5bnnLFTRdTh2IQGmFN1sM2Su0E1F2hBoFHVQxrJslNFTc5EyL-GIybbWypJypZVTRNhsAlsStfdqyCr0e185QtNj__jFQh-r0sxODt45HKj8KJBSVEBx-BPGdHnflUU63i1-O1SCo2wJFFQ%3D&q=billy+joel+she%27s+always+a+woman&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAJ6BAgdEAQ" + }, + { + "text": "Pressure|Billy Joel", + "url": 
"/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rNw0aV2xaMSajixiwS9lmNu5OlNFGzE_NenPRGAlzGM1jiGw7P8U8fGFWww5k3cwcZKmLpxhfBy7Ta-eg5ARgwWlf3FMEDNSt-3xkFwrbp_2QcjcHkAXjdpW9yGZri85ZSUq-W30lUxPQDnSda27tqK1VX_kM8ZIPp6l1WeYlLPXi1kszB1Z6h2cg0QvUdOLIo3CVE3qrJvIBE9Vs4ojgemkx9lYDZJWMHocqx5uLTdtR6YEc%3D&q=billy+joel+pressure&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAN6BAgdEAU" + } + ] + }, + "error": null, + "section": "main", + "serp_rank": 1, + "sub_rank": 0, + "sub_type": "panel", + "text": null, + "title": null, + "type": "knowledge", + "url": null + }, + { + "cite": "8.1M+ views · 10 years ago", + "cmpt_rank": 2, + "details": { + "duration": "4:46", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 2, + "sub_rank": 0, + "sub_type": "video", + "text": "MusicvideobyBilly JoelperformingMy Life. (C) 2011 Sony Music Entertainment.", + "title": "Billy Joel - My Life (Audio) - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=HVX80UpMPDI" + }, + { + "cite": "11.4M+ views · 8 years ago", + "cmpt_rank": 3, + "details": { + "duration": "4:44", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 3, + "sub_rank": 0, + "sub_type": "video", + "text": "Provided toYouTubeby ColumbiaMy Life·Billy Joel52nd Street ℗ 1978 Columbia Records, a division of Sony Music Entertainment Released ...", + "title": "My Life - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=NlvU-EHk4Nc" + }, + { + "cite": "618.2K+ views · 5 years ago", + "cmpt_rank": 4, + "details": { + "duration": "4:18", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 4, + "sub_rank": 0, + "sub_type": "video", + "text": "Billy Joel“My Life\" Live from Long Island Listen toBilly Joel: https://billyjoel.lnk.to/playlist!MyLifeFollowBilly Joel: Facebook: ...", + "title": "Billy Joel - My Life (Live from Long Island) - YouTube", + "type": "general", + "url": 
"https://www.youtube.com/watch?v=Tka4DQGx7zc" + }, + { + "cite": "3K+ views · 1 year ago", + "cmpt_rank": 5, + "details": { + "duration": "4:26", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 5, + "sub_rank": 0, + "sub_type": "video", + "text": "\"My Life\" is asongbyBilly Joelthat first appeared on his 1978 album 52nd Street. A single version was released in the fall of 1978 and ...", + "title": "Billy Joel ~ My Life (1978) - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=l_gDOtI1Tdw" + }, + { + "cite": "4.5K+ views · 1 year ago", + "cmpt_rank": 6, + "details": { + "duration": "4:40", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 6, + "sub_rank": 0, + "sub_type": "video", + "text": "Subscribe and press ( ) to join the Notification Squad and stay updated with new uploads and DROP A COMMENT. [Instrumental Intro] [Verse 1] ...", + "title": "My Life - Billy Joel (Lyrics) - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=lzB2n2jr71Q" + }, + { + "cite": "5.3M+ views · 10 years ago", + "cmpt_rank": 7, + "details": { + "duration": "5:59", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 7, + "sub_rank": 0, + "sub_type": "video", + "text": "... 
Joel Listen toBilly Joel: https://billyjoel.lnk.to/listenYD Subscribe to theBilly Joel YouTubeChannel: https://billyjoel.lnk.to/subscribe ...", + "title": "Billy Joel - My Life (Live From The River Of Dreams Tour)", + "type": "general", + "url": "https://www.youtube.com/watch?v=Pm6hEU90G3A" + }, + { + "cite": "85.3K+ views · 7 years ago", + "cmpt_rank": 8, + "details": { + "duration": "4:42", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 8, + "sub_rank": 0, + "sub_type": "video", + "text": "Provided toYouTubeby ColumbiaMy Life·Billy JoelThe EssentialBilly Joel℗ 1978 Columbia Records, a division of Sony Music ...", + "title": "Billy Joel - My Life - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=zQra7WumCTQ" + }, + { + "cite": "33.9K+ views · 3 years ago", + "cmpt_rank": 9, + "details": { + "duration": "4:40", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 9, + "sub_rank": 0, + "sub_type": "video", + "text": "Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life...", + "title": "Billy Joel - My Life (Lyrics) - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=sA7OcLYZ2qk" + }, + { + "cite": "1.7M+ views · 12 years ago", + "cmpt_rank": 10, + "details": { + "duration": "4:46", + "source": null, + "type": "video" + }, + "error": null, + "section": "main", + "serp_rank": 10, + "sub_rank": 0, + "sub_type": "video", + "text": "My Lifefrom The EssentialBilly Joelalbum. 
Thissongis for entertainment purposes only.", + "title": "My Life - Billy Joel - YouTube", + "type": "general", + "url": "https://www.youtube.com/watch?v=HYayK5uEvfo" + }, + { + "cite": null, + "cmpt_rank": 11, + "details": { + "heading": "Related searches", + "items": [ + "billy joel - my life meaning", + "my life lyrics", + "this is my life - song 80s", + "billy joel - my life (live)", + "my life beatles", + "who sings this is my life song", + "billy joel - my life lyrics", + "all my life" + ], + "type": "text" + }, + "error": null, + "section": "footer", + "serp_rank": 11, + "sub_rank": 0, + "sub_type": "related_searches", + "text": "billy joel - my life meaning<|>my life lyrics<|>this is my life - song 80s<|>billy joel - my life (live)<|>my life beatles<|>who sings this is my life song<|>billy joel - my life lyrics<|>all my life", + "title": null, + "type": "searches_related", + "url": null + } + ] +} diff --git a/tests/fixtures/serps.json.bz2 b/tests/fixtures/serps.json.bz2 index 253489c..fde41d4 100644 Binary files a/tests/fixtures/serps.json.bz2 and b/tests/fixtures/serps.json.bz2 differ