Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
- Fixed `ComponentList.add_component` to honor an explicit `cmpt_rank` of `0` (the previous falsy check replaced rank `0` with the auto-counter)
- Added characterization tests pinning every main-layout routing and extraction branch; package-wide simplification pass (dead-code removal, shared-helper reuse, hoisted regex compilation) with no behavior change
- Parse whole-page knowledge-panel (`kp-wholepage`) tabs as a sub-column: when the panel has collapsed the main result column into its active tab, the tab's heterogeneous blocks (organics, `top_stories`, `videos`, knowledge cards, and specialized widgets) are each routed through the normal classify/parse pipeline instead of being emitted as one mislabeled component. Anchors on the stable `kp-wp-tab-cont-*` id and flattens grouping wrappers, recovering organics that were silently dropped on election/entity panels whose blocks nest under a wrapper the per-tab recipes could not reach (an Arizona-primaries-style panel went from 0 organics to the full column). The `standard-overview`/`standard-airfares` recipes still claim the panels they natively and fully handle (output unchanged); the sub-column model takes over only when a recipe under-extracts. Supersedes the interim `div.g`-only recovery
- Fixed a regression on *complementary* `kp-wholepage` panels — a music/entity panel that sits beside an intact organic column instead of collapsing it (e.g. `this is my life billy joel youtube`): the sub-column gate mis-keyed on `div.tF2Cxc` and dropped the 10 video organics, and the panel was mistyped as `searches_related`/`or_give_general_feedback` because of an inner "People also search for" heading. The collapse gate now recognizes any titled-link `div.g` (`h3` + `a[href]`) as organic, and both the header classifier and the knowledge sub-typer defer for `kp-wholepage-osrp` panels so they classify as `knowledge`/`panel`. The true collapsed case (`30 year mortgage rates`) is unaffected
- Split bare-`tF2Cxc` organic bundles in `general` components into one result per organic (excluding People-Also-Ask sources), instead of collapsing them to a single result — recovers the organics on finance-style panels (e.g. an `aapl stock price` panel went from 1 to 9 organics)
- Added `election_dates`, `election_results`, and `election_resources` component types for the election widgets embedded in whole-page election panels

Expand Down
7 changes: 7 additions & 0 deletions WebSearcher/classifiers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ class ClassifyMainHeader:
@staticmethod
def classify(cmpt, levels: tuple[int, ...] = (2, 3)) -> str:
node: Node = cmpt
# A whole-page entity panel (``kp-wholepage-osrp``) can embed sub-carousels
# ("People also search for") and feedback affordances whose level-2 headings
# would mis-claim the entire panel (e.g. as ``searches_related``). Defer to the
# structural classifiers downstream (``available_on``, ``knowledge_panel``),
# which type these panels correctly.
if node.css_first("div.kp-wholepage-osrp") is not None:
return "unknown"
for level in levels:
header = ClassifyMainHeader._classify_header(node, level)
if header != "unknown":
Expand Down
7 changes: 7 additions & 0 deletions WebSearcher/component_parsers/knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,13 @@ def _subtype_things_to_know(node: Node, parsed: dict, details: dict, h2_text: st
def _subtype_dynamic_section(node: Node, parsed: dict, details: dict, h2_text: str) -> bool:
if node.css_first("div.JNkvid") is None:
return False
# A whole-page entity panel (kp-wholepage-osrp) is a generic "panel": an internal
# subcard (e.g. a 'People also search for' carousel, or a feedback affordance whose
# heading precedes it in document order) is a section of the panel, not the panel's
# defining sub_type. Defer to the ``_subtype_panel`` fallback so the whole component
# stays ``panel`` instead of inheriting a subcard/affordance heading.
if node.css_first("div.kp-wholepage-osrp") is not None:
return False
section_heading = node.css_first('[role="heading"][aria-level="2"]')
if section_heading is None:
# JNkvid without a section heading falls through to the panel fallback.
Expand Down
24 changes: 20 additions & 4 deletions WebSearcher/extractors/extractor_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,28 @@ def _kp_subcolumn(kp: Node) -> list[Node]:

@staticmethod
def _kp_organics_outside(rso_div: Node, kp: Node) -> bool:
"""True if any ``div.g`` organic (``div.tF2Cxc`` inside) sits in ``#rso``
but outside the panel -- the tell that the panel is a *complementary* side
panel, not the collapsed main column."""
"""True if any organic ``div.g`` sits in ``#rso`` but outside the panel --
the tell that the panel is a *complementary* side panel, not the collapsed
main column.

An organic is any ``div.g`` carrying a titled link (an ``h3`` and an
``a[href]``), not just the classic ``div.tF2Cxc`` text result: a music/video
panel renders its organic video results as ``div.g`` siblings of the panel
(a YouTube ``div.g`` has the title+link but no inner ``div.tF2Cxc``). Keying
the guard on ``div.tF2Cxc`` alone missed those, so the gate misfired and the
sub-column path discarded the video organics. ``h3`` + ``a[href]`` matches
both shapes while still excluding chrome ``div.g`` wrappers.

Assumption: a titled-link ``div.g`` is an *organic*, i.e. the tell of a
complementary panel. A genuinely collapsed panel that nonetheless renders
some titled-link ``div.g`` (a PAA block, a video-carousel item, an image
card with a heading link) *outside* the ``kp`` in ``#rso`` would wrongly
read as complementary here and skip the sub-column collapse. No such shape
is known (the collapsed "30 year mortgage rates" fixture keeps its organics
inside the panel); revisit this test if one surfaces."""
kp_id = kp.mem_id
for g in subtree_css(rso_div, "div.g"):
if g.css_first("div.tF2Cxc") is None:
if g.css_first("h3") is None or g.css_first("a[href]") is None:
continue
node, inside = g, False
while node is not None and node.mem_id != rso_div.mem_id:
Expand Down
268 changes: 268 additions & 0 deletions tests/__snapshots__/test_parse_serp/test_parse_serp[3917a73cbceb].json
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
{
"features": {
"captcha": false,
"infinity_scroll": true,
"language": "en",
"main_layout": "standard",
"notice_no_results": false,
"notice_server_error": false,
"notice_shortened_query": false,
"overlay_precise_location": false,
"result_estimate_count": 8540000.0,
"result_estimate_time": 0.48
},
"results": [
{
"cite": "22.9M+ views · 14 years ago",
"cmpt_rank": 0,
"details": {
"duration": "5:41",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 0,
"sub_rank": 0,
"sub_type": "video",
"text": "Official HD musicvideofor \"My Life\" byBilly JoelListen toBilly Joel: https://billyjoel.lnk.to/listenYD Subscribe to theBilly Joel...",
"title": "Billy Joel - My Life (Official Video) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=h3JFEfdK_Ls"
},
{
"cite": null,
"cmpt_rank": 1,
"details": {
"heading": "Choose what you’re giving feedback on",
"img_url": null,
"text": "Choose what you’re giving feedback on|general feedback|My Life|Song by Billy Joel|Song by Billy Joel|Lyrics|Videos|Listen|iHeartRadio|iHeartRadio|Deezer|Deezer|People also search for|People also search for|People also search for|The River of Dreams|Billy Joel|It's Still Rock and Roll to Me|Billy Joel|She's Always a Woman|Billy Joel|Pressure|Billy Joel|See all",
"type": "panel",
"urls": [
{
"text": "iHeartRadio",
"url": "https://www.iheart.com/artist/billy-joel-4684/songs/my-life-163680770/?autoplay=true"
},
{
"text": "Deezer",
"url": "https://www.deezer.com/track/15179460"
},
{
"text": "The River of Dreams|Billy Joel",
"url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rO4P19IF_yO85wYpkEaz-W_oZWd5JUOOVnUVftf2aeofFP9VwCCA6Uqaikfh-92GALnRq-UPURedMcT0lWCLe3ziKnjpQjdAKY0L7ZI9W4TxjJkH0-Ws0s0Q5wRWqpr-h7J_NSMc9pu69Nt0K4bXsD10Tdag6PuLT59EQs4tTA9haVfss7bCN05GWy2CSjcl3FML8PlA6gAHBDicp4l858JXuMRoZgDoVrL9uCGM5ATRyU_n4%3D&q=billy+joel+the+river+of+dreams&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAB6BAgdEAI"
},
{
"text": "It's Still Rock and Roll to Me|Billy Joel",
"url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rO4P19IF_yO85wYpkEaz-W_oZWd5JUOOVnUVftf2aeoWz1Wedhm8th9fOZsRFuLTXs_Ta_IoH8SH7My22Pq8cbOYumb7DW2V2K1CnsuoifO2ihWhwl36OZcogZGvimjEtNh_aDWGNphEh8EdSJ_KTHETs6dbWLlhnOWbtflpbjckCncG1qT2XTRz3Jdg-4n-rgorCGenl8umHipXHK6EaNlAQKz_YkwNF4QcWi6DuGQwpT960Psdwv1vdhBTo8pif2SFSu&q=billy+joel+it%27s+still+rock+and+roll+to+me&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAF6BAgdEAM"
},
{
"text": "She's Always a Woman|Billy Joel",
"url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rNw0aV2xaMSajixiwS9lmNu5OlNFGzE_NenPRGAlzGM7dChKzB13VzQg5KJIStO84yXmiuqILspKMTxPcDkqrNCH2nE_Fpk4yq5bnnLFTRdTh2IQGmFN1sM2Su0E1F2hBoFHVQxrJslNFTc5EyL-GIybbWypJypZVTRNhsAlsStfdqyCr0e185QtNj__jFQh-r0sxODt45HKj8KJBSVEBx-BPGdHnflUU63i1-O1SCo2wJFFQ%3D&q=billy+joel+she%27s+always+a+woman&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAJ6BAgdEAQ"
},
{
"text": "Pressure|Billy Joel",
"url": "/search?sca_esv=52b056e1a97f92b3&si=AKbGX_rNw0aV2xaMSajixiwS9lmNu5OlNFGzE_NenPRGAlzGM1jiGw7P8U8fGFWww5k3cwcZKmLpxhfBy7Ta-eg5ARgwWlf3FMEDNSt-3xkFwrbp_2QcjcHkAXjdpW9yGZri85ZSUq-W30lUxPQDnSda27tqK1VX_kM8ZIPp6l1WeYlLPXi1kszB1Z6h2cg0QvUdOLIo3CVE3qrJvIBE9Vs4ojgemkx9lYDZJWMHocqx5uLTdtR6YEc%3D&q=billy+joel+pressure&sa=X&ved=2ahUKEwjaiKrknfCEAxWUF1kFHdI3AEkQs9oBKAN6BAgdEAU"
}
]
},
"error": null,
"section": "main",
"serp_rank": 1,
"sub_rank": 0,
"sub_type": "panel",
"text": null,
"title": null,
"type": "knowledge",
"url": null
},
{
"cite": "8.1M+ views · 10 years ago",
"cmpt_rank": 2,
"details": {
"duration": "4:46",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 2,
"sub_rank": 0,
"sub_type": "video",
"text": "MusicvideobyBilly JoelperformingMy Life. (C) 2011 Sony Music Entertainment.",
"title": "Billy Joel - My Life (Audio) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=HVX80UpMPDI"
},
{
"cite": "11.4M+ views · 8 years ago",
"cmpt_rank": 3,
"details": {
"duration": "4:44",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 3,
"sub_rank": 0,
"sub_type": "video",
"text": "Provided toYouTubeby ColumbiaMy Life·Billy Joel52nd Street ℗ 1978 Columbia Records, a division of Sony Music Entertainment Released ...",
"title": "My Life - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=NlvU-EHk4Nc"
},
{
"cite": "618.2K+ views · 5 years ago",
"cmpt_rank": 4,
"details": {
"duration": "4:18",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 4,
"sub_rank": 0,
"sub_type": "video",
"text": "Billy Joel“My Life\" Live from Long Island Listen toBilly Joel: https://billyjoel.lnk.to/playlist!MyLifeFollowBilly Joel: Facebook: ...",
"title": "Billy Joel - My Life (Live from Long Island) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=Tka4DQGx7zc"
},
{
"cite": "3K+ views · 1 year ago",
"cmpt_rank": 5,
"details": {
"duration": "4:26",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 5,
"sub_rank": 0,
"sub_type": "video",
"text": "\"My Life\" is asongbyBilly Joelthat first appeared on his 1978 album 52nd Street. A single version was released in the fall of 1978 and ...",
"title": "Billy Joel ~ My Life (1978) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=l_gDOtI1Tdw"
},
{
"cite": "4.5K+ views · 1 year ago",
"cmpt_rank": 6,
"details": {
"duration": "4:40",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 6,
"sub_rank": 0,
"sub_type": "video",
"text": "Subscribe and press ( ) to join the Notification Squad and stay updated with new uploads and DROP A COMMENT. [Instrumental Intro] [Verse 1] ...",
"title": "My Life - Billy Joel (Lyrics) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=lzB2n2jr71Q"
},
{
"cite": "5.3M+ views · 10 years ago",
"cmpt_rank": 7,
"details": {
"duration": "5:59",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 7,
"sub_rank": 0,
"sub_type": "video",
"text": "... Joel Listen toBilly Joel: https://billyjoel.lnk.to/listenYD Subscribe to theBilly Joel YouTubeChannel: https://billyjoel.lnk.to/subscribe ...",
"title": "Billy Joel - My Life (Live From The River Of Dreams Tour)",
"type": "general",
"url": "https://www.youtube.com/watch?v=Pm6hEU90G3A"
},
{
"cite": "85.3K+ views · 7 years ago",
"cmpt_rank": 8,
"details": {
"duration": "4:42",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 8,
"sub_rank": 0,
"sub_type": "video",
"text": "Provided toYouTubeby ColumbiaMy Life·Billy JoelThe EssentialBilly Joel℗ 1978 Columbia Records, a division of Sony Music ...",
"title": "Billy Joel - My Life - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=zQra7WumCTQ"
},
{
"cite": "33.9K+ views · 3 years ago",
"cmpt_rank": 9,
"details": {
"duration": "4:40",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 9,
"sub_rank": 0,
"sub_type": "video",
"text": "Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life(Lyrics)Billy Joel-My Life...",
"title": "Billy Joel - My Life (Lyrics) - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=sA7OcLYZ2qk"
},
{
"cite": "1.7M+ views · 12 years ago",
"cmpt_rank": 10,
"details": {
"duration": "4:46",
"source": null,
"type": "video"
},
"error": null,
"section": "main",
"serp_rank": 10,
"sub_rank": 0,
"sub_type": "video",
"text": "My Lifefrom The EssentialBilly Joelalbum. Thissongis for entertainment purposes only.",
"title": "My Life - Billy Joel - YouTube",
"type": "general",
"url": "https://www.youtube.com/watch?v=HYayK5uEvfo"
},
{
"cite": null,
"cmpt_rank": 11,
"details": {
"heading": "Related searches",
"items": [
"billy joel - my life meaning",
"my life lyrics",
"this is my life - song 80s",
"billy joel - my life (live)",
"my life beatles",
"who sings this is my life song",
"billy joel - my life lyrics",
"all my life"
],
"type": "text"
},
"error": null,
"section": "footer",
"serp_rank": 11,
"sub_rank": 0,
"sub_type": "related_searches",
"text": "billy joel - my life meaning<|>my life lyrics<|>this is my life - song 80s<|>billy joel - my life (live)<|>my life beatles<|>who sings this is my life song<|>billy joel - my life lyrics<|>all my life",
"title": null,
"type": "searches_related",
"url": null
}
]
}
Binary file modified tests/fixtures/serps.json.bz2
Binary file not shown.