From 22b40275a6d6753ad42b9d4ab6e30b5087686abe Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 01:10:22 +0000 Subject: [PATCH 1/8] Simplification pass: dead code, reuse, and efficiency cleanups Quality-only, behavior-preserving (296 passed, 66 snapshots unchanged, pyrefly clean). Findings from a 4-angle review (reuse/simplify/efficiency/altitude) of the WebSearcher package: - classifiers/main.py knowledge_box: remove two dead first_text computations (splitlines + split fallback) that were unconditionally overwritten by the _iter_text_fragments walk; hoist the function-local import to module top. Flagged by all four review angles. - classifiers/main.py general: compute class_tokens only in the branch that uses it (skip the alloc on the classless path). - classifiers/footer.py: reorder the short-circuit loop to the natural run-then-check form. - component_parsers/general.py: hoist per-call re.compile in parse_ratings/parse_product to module constants, matching the existing _ARIA_*_RE pattern. - components.py: drop dead result_counter attribute; fix to_records loop var shadowing the Component class name. - utils.py get_domain: collapse to early returns, building only the domain string actually returned. Skipped (noted for follow-up): extractor_main standard-layout ladder->table (hot path, subtle per-branch filtering, needs uncovered-layout pins first); the endswith('locations') header special-case (registry-field change is over-engineering for one marker); Component.get_metadata removal (public API on a published lib) and the cmpt_rank==0 guard (behavior-changing). 
--- WebSearcher/classifiers/footer.py | 2 +- WebSearcher/classifiers/main.py | 24 ++++-------------------- WebSearcher/component_parsers/general.py | 20 +++++++++++--------- WebSearcher/components.py | 3 +-- WebSearcher/utils.py | 10 +++------- 5 files changed, 20 insertions(+), 39 deletions(-) diff --git a/WebSearcher/classifiers/footer.py b/WebSearcher/classifiers/footer.py index 14d03ba..8a470a6 100644 --- a/WebSearcher/classifiers/footer.py +++ b/WebSearcher/classifiers/footer.py @@ -28,9 +28,9 @@ def classify(cmpt: Node) -> str: cmpt_type = "unknown" for classifier in classifier_list: + cmpt_type = classifier(node) if cmpt_type != "unknown": break - cmpt_type = classifier(node) # Fall back to main classifier if cmpt_type == "unknown": diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 4134b08..42f39b5 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -3,7 +3,7 @@ from selectolax.lexbor import LexborNode as Node from .. import logger -from .._slx import class_tokens, get_text +from .._slx import _iter_text_fragments, class_tokens, get_text from ..component_types import header_text_to_type log = logger.Logger().start(__name__) @@ -186,10 +186,10 @@ def general(cmpt) -> str: """Classify general components.""" node: Node = cmpt node_id = node.mem_id - cls = class_tokens(node) # bs4 distinguished "class" present vs absent via ``"class" in cmpt.attrs`` # -- preserve that distinction explicitly. if "class" in node.attributes: + cls = class_tokens(node) conditions = { "format-01": cls == ["g"], "format-02": ("g" in cls) and ("Ww4FFb" in cls), @@ -268,30 +268,14 @@ def knowledge_box(cmpt) -> str: condition["locations"] = node.css_first("div.zd2Jbb") is not None condition["events"] = node.css_first("g-card.URhAHe") is not None condition["jobs"] = node.css_first("g-card.cvoI5e") is not None - # bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank - # text fragment in the subtree. 
Use the _slx walker indirectly via - # iter_text_fragments-style filter. + # bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank text + # fragment in the subtree; ``_iter_text_fragments`` replicates stripped_strings. first_text: str | None = None - for s in (get_text(node) or "").splitlines(): - s2 = s.strip() - if s2: - first_text = s2 - break - if first_text is None: - # fallback: pull first non-whitespace fragment from text walker - text = get_text(node) or "" - first_text = text.strip().split()[0] if text.strip() else None - # Simpler & more faithful: replicate stripped_strings exactly via the - # _slx iter_text_fragments walker. - from .._slx import _iter_text_fragments - for raw in _iter_text_fragments(node): stripped = raw.strip() if stripped: first_text = stripped break - else: - first_text = None if first_text is not None: condition["covid_alert"] = first_text == "COVID-19 alert" for condition_type, conditions in condition.items(): diff --git a/WebSearcher/component_parsers/general.py b/WebSearcher/component_parsers/general.py index 971f445..66bf1c7 100644 --- a/WebSearcher/component_parsers/general.py +++ b/WebSearcher/component_parsers/general.py @@ -183,6 +183,10 @@ def _next_sibling_with_text(node: Node) -> Node | None: _ARIA_RATING_RE = re.compile(r"Rated\s+(\d+(?:\.\d+)?)\s+out of\s+(\d+)") _ARIA_REVIEWS_RE = re.compile(r"\(([\d,]+)\)\s*user reviews?") +_RATING_NUMERIC_RE = re.compile(r"^\d*[.]?\d*$") +_RATING_VOTES_RE = re.compile(r" vote[s]?| review[s]?") +_RATING_REVIEW_BY_RE = re.compile("Review by") +_PRODUCT_SPLIT_RE = re.compile("-|·") def parse_rating_aria_label(aria_label: str) -> dict: @@ -202,26 +206,24 @@ def parse_rating_aria_label(aria_label: str) -> dict: def parse_ratings(text) -> dict: text = [t.strip() for t in text] - numeric = re.compile(r"^\d*[.]?\d*$") rating = re.split("Rating: ", text[0])[-1] - details: dict = {"rating": float(rating)} if numeric.match(rating) else {"rating": rating} + details: dict = ( + 
{"rating": float(rating)} if _RATING_NUMERIC_RE.match(rating) else {"rating": rating} + ) if len(text) > 1: - str_match_0 = re.compile(" vote[s]?| review[s]?") - str_match_1 = re.compile("Review by") - if str_match_0.search(text[1]): - reviews = re.split(str_match_0, text[1])[0] + if _RATING_VOTES_RE.search(text[1]): + reviews = re.split(_RATING_VOTES_RE, text[1])[0] reviews = reviews.replace(",", "")[1:] # [1:] drops unicode char details["reviews"] = int(reviews) - elif str_match_1.search(text[1]): + elif _RATING_REVIEW_BY_RE.search(text[1]): details["reviews"] = 1 return details def parse_product(text: str) -> dict: - split_match = re.compile("-|·") - parts = re.split(split_match, text) + parts = re.split(_PRODUCT_SPLIT_RE, text) if len(parts) == 1: return {"price": parts[0].strip()[1:]} return {"price": parts[0].strip()[1:], "stock": parts[1].strip()[1:]} diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 1fd5c8a..62d364c 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -40,7 +40,6 @@ def __init__( self.type = type self.cmpt_rank = cmpt_rank self.result_list: list[dict] = [] - self.result_counter = 0 def __str__(self) -> str: return str(vars(self)) @@ -227,4 +226,4 @@ def export_component_results(self): return results def to_records(self): - return [Component.to_dict() for Component in self.components] + return [cmpt.to_dict() for cmpt in self.components] diff --git a/WebSearcher/utils.py b/WebSearcher/utils.py index be0b2a5..e304967 100644 --- a/WebSearcher/utils.py +++ b/WebSearcher/utils.py @@ -129,13 +129,9 @@ def get_domain(url: str | None) -> str: if not url: return "" domain = tldextract.extract(url) - without_subdomain = ".".join([domain.domain, domain.suffix]) - with_subdomain = ".".join([domain.subdomain, domain.domain, domain.suffix]) - if domain.subdomain: - domain_str = without_subdomain if domain.subdomain == "www" else with_subdomain - else: - domain_str = without_subdomain - return domain_str + if 
domain.subdomain and domain.subdomain != "www": + return ".".join([domain.subdomain, domain.domain, domain.suffix]) + return ".".join([domain.domain, domain.suffix]) # Sessions --------------------------------------------------------------------- From 1cc17251c45a512d385fa93f3828d187f88d9593 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 01:32:20 +0000 Subject: [PATCH 2/8] Remove dead Component.get_metadata method Never called anywhere in the package, tests, or docs, and carried a mutable-default-arg smell (key_filter=[...]). Confirmed dead before removal. 296 passed, 66 snapshots unchanged, pyrefly clean. --- WebSearcher/components.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 62d364c..96c762a 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -47,9 +47,6 @@ def __str__(self) -> str: def to_dict(self) -> dict: return self.__dict__ - def get_metadata(self, key_filter=["section", "cmpt_rank"]) -> dict: - return {k: v for k, v in self.to_dict().items() if k in key_filter} - def classify_component(self, classify_type_func: Callable | None = None): """Classify the component type""" if classify_type_func: From 76c8874c87877ad1cc6e165e0d65d8f706f36011 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 01:36:19 +0000 Subject: [PATCH 3/8] Fix add_component to honor an explicit cmpt_rank of 0 add_component used `if not cmpt_rank` to detect a missing rank, but the sentinel is cmpt_rank=None -- so an explicit 0 was treated as missing and silently replaced by the counter. Use `is None` to match the documented sentinel. No internal caller passes cmpt_rank today (the only splat carries elem/section/type), so this is a no-op on current behavior and a correctness fix for external/future callers. Add two ComponentList.add_component tests: auto-increment from 0, and the explicit-zero pin (fails on the old `not cmpt_rank` form). 
298 passed, 66 snapshots unchanged, pyrefly clean. --- WebSearcher/components.py | 2 +- tests/test_components.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 96c762a..2f7c8ef 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -146,7 +146,7 @@ def __iter__(self): def add_component(self, elem, section="unknown", type="unknown", cmpt_rank=None): """Add a component to the list of components""" - cmpt_rank = self.cmpt_rank_counter if not cmpt_rank else cmpt_rank + cmpt_rank = self.cmpt_rank_counter if cmpt_rank is None else cmpt_rank component = Component(elem, section, type, cmpt_rank) self.components.append(component) diff --git a/tests/test_components.py b/tests/test_components.py index 9d85e29..2e3bca4 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -7,7 +7,7 @@ """ from WebSearcher import utils -from WebSearcher.components import Component +from WebSearcher.components import Component, ComponentList def comp(inner: str): @@ -45,3 +45,19 @@ def test_select_parser_returns_callable_for_unknown_and_registered(): assert callable(unknown_c.select_parser()) general_c = Component(comp("x"), section="main", type="general") assert callable(general_c.select_parser()) + + +def test_add_component_auto_increments_rank_from_zero(): + cl = ComponentList() + cl.add_component(comp("a"), section="main") + cl.add_component(comp("b"), section="main") + assert [c.cmpt_rank for c in cl.components] == [0, 1] + + +def test_add_component_honors_explicit_zero_rank(): + # cmpt_rank=None is the "auto-assign" sentinel; an explicit 0 must be kept, + # not treated as falsy-missing (it would otherwise pick up the counter, 1). 
+ cl = ComponentList() + cl.add_component(comp("a"), section="main") # auto rank 0, counter -> 1 + cl.add_component(comp("b"), section="main", cmpt_rank=0) + assert cl.components[1].cmpt_rank == 0 From 13fec5d34335bcf80b4425394a696598a1d4272b Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 02:30:33 +0000 Subject: [PATCH 4/8] Add main_layout field to parse_serp features output The main-section layout label was internal-only dispatch state on ExtractorMain. Surface it as features.main_layout so the SERP layout type is observable in parsed output (e.g. 'standard', 'standard-0'). - SERPFeatures: add main_layout: str | None = None - parse_serp: set features.main_layout from extractor.main_handler .layout_label after extraction (no FeatureExtractor<->Extractor coupling) - Regenerate 66 snapshots (additive: +1 key each; 62 standard, 3 standard-0, 1 standard-4) - Pin the wiring with test_features_expose_main_layout Additive output change; existing results consumers are unaffected. 299 passed, pyrefly clean. 
--- WebSearcher/models/features.py | 3 +++ WebSearcher/parsers.py | 9 ++++++--- .../test_parse_serp[01f85d1329ba].json | 1 + .../test_parse_serp[032572e185d3].json | 1 + .../test_parse_serp[0410241ce1e2].json | 1 + .../test_parse_serp[0d3fc3b49b76].json | 1 + .../test_parse_serp[0ed311025efc].json | 1 + .../test_parse_serp[130eba186e94].json | 1 + .../test_parse_serp[18eccfe8454e].json | 1 + .../test_parse_serp[2c0aa0bbcd0c].json | 1 + .../test_parse_serp[2d1b05a046b2].json | 1 + .../test_parse_serp[305b53af69be].json | 1 + .../test_parse_serp[30926d7c7ae9].json | 1 + .../test_parse_serp[30c5d6bdb650].json | 1 + .../test_parse_serp[39617f527744].json | 1 + .../test_parse_serp[3c03a4a2cb7c].json | 1 + .../test_parse_serp[3c09a0f0c92f].json | 1 + .../test_parse_serp[3f5efb1dc358].json | 1 + .../test_parse_serp[45b6e019bfa2].json | 1 + .../test_parse_serp[4c8d8d2f226c].json | 1 + .../test_parse_serp[53940e35cc92].json | 1 + .../test_parse_serp[56cbcf8cd4dc].json | 1 + .../test_parse_serp[56f2eab63e9d].json | 1 + .../test_parse_serp[5898b04fb534].json | 1 + .../test_parse_serp[6978d0cd767d].json | 1 + .../test_parse_serp[6aa70651b0cd].json | 1 + .../test_parse_serp[6dc5bc34ff55].json | 1 + .../test_parse_serp[6e206db14899].json | 1 + .../test_parse_serp[6e401e618433].json | 1 + .../test_parse_serp[7049404a2dd6].json | 1 + .../test_parse_serp[7333536d2911].json | 1 + .../test_parse_serp[7ad9715f3597].json | 1 + .../test_parse_serp[7b89c00120e3].json | 1 + .../test_parse_serp[7d76d3a83ebc].json | 1 + .../test_parse_serp[811a27f92284].json | 1 + .../test_parse_serp[82e35954f552].json | 1 + .../test_parse_serp[83b17a6a7750].json | 1 + .../test_parse_serp[8d1b75b71e7f].json | 1 + .../test_parse_serp[8e820f7b024f].json | 1 + .../test_parse_serp[8f98fa9c0bef].json | 1 + .../test_parse_serp[9101d12ab778].json | 1 + .../test_parse_serp[923a428c1c22].json | 1 + .../test_parse_serp[97404b7b7c61].json | 1 + .../test_parse_serp[984065877aad].json | 1 + 
.../test_parse_serp[9a7e39d95bf0].json | 1 + .../test_parse_serp[9ed1baa7715d].json | 1 + .../test_parse_serp[a6c881e003e2].json | 1 + .../test_parse_serp[a6c8fe7fe769].json | 1 + .../test_parse_serp[aa594f199c3d].json | 1 + .../test_parse_serp[b15c5131b06c].json | 1 + .../test_parse_serp[b186024ec98a].json | 1 + .../test_parse_serp[b2e1777bf0f2].json | 1 + .../test_parse_serp[be99c971b8f7].json | 1 + .../test_parse_serp[c48f8aa3f6da].json | 1 + .../test_parse_serp[c9ab650f5bda].json | 1 + .../test_parse_serp[cad43c3268a8].json | 1 + .../test_parse_serp[ce37f114963e].json | 1 + .../test_parse_serp[d1855fa9cd1c].json | 1 + .../test_parse_serp[d1ac0c4abb10].json | 1 + .../test_parse_serp[d920789249af].json | 1 + .../test_parse_serp[da9b4fce9ab0].json | 1 + .../test_parse_serp[dc5861b33dda].json | 1 + .../test_parse_serp[e71a1cb4cd70].json | 1 + .../test_parse_serp[e828d00dc1b3].json | 1 + .../test_parse_serp[eab14aa4ff5d].json | 1 + .../test_parse_serp[f006c9318116].json | 1 + .../test_parse_serp[f6fae1c9a96e].json | 1 + .../test_parse_serp[faa9c7c889db].json | 1 + tests/test_parse_serp.py | 13 +++++++++++++ 69 files changed, 88 insertions(+), 3 deletions(-) diff --git a/WebSearcher/models/features.py b/WebSearcher/models/features.py index b39b7fa..e4bd54f 100644 --- a/WebSearcher/models/features.py +++ b/WebSearcher/models/features.py @@ -13,3 +13,6 @@ class SERPFeatures(BaseModel): infinity_scroll: bool = False overlay_precise_location: bool = False captcha: bool = False + # Main-section layout label assigned during extraction, e.g. "standard", + # "standard-0". None when no layout was detected. 
+ main_layout: str | None = None diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index d66200b..e6a3050 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -39,9 +39,12 @@ def parse_serp(serp: str | Node) -> dict: finally: raw_serp_html.reset(token) + # Forward raw HTML (when available) + soup so feature extraction takes the + # regex path and reuses the already-parsed soup for shared probes. The main + # layout label is internal to extraction, so surface it on the features here. + features = FeatureExtractor.extract_features(serp, soup=soup) + features.main_layout = extractor.main_handler.layout_label return { - # Forward raw HTML (when available) + soup so feature extraction takes - # the regex path and reuses the already-parsed soup for shared probes. - "features": FeatureExtractor.extract_features(serp, soup=soup).model_dump(), + "features": features.model_dump(), "results": results, } diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json index db631d1..0433592 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json index 720335a..c13b978 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, 
"notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json index e4cd6ec..ddd1019 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json index 9a00a24..75d1bd0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json index 6332f59..70ce070 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json index 112e573..2c914f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json @@ -3,6 
+3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json index 301bd1f..0f21fd5 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json index a056de9..bb280e7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json index 258ba6a..1979fc1 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json index 398f17d..4f8cf8a 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json index 192cb40..2bef2b3 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json @@ -3,6 +3,7 @@ "captcha": true, "infinity_scroll": true, "language": "en", + "main_layout": "standard-0", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json index 2b7c49a..9e98f20 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json index 30cff2f..c9fc124 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json index 3716b9f..1d18c6d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json index 0aaa079..6ba005d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json index 7c826c6..785d98d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json index f8fc1bc..486d7ed 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": 
false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json index 3a4ff55..78904bc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json index 92705b1..a1399da 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json index 69009dc..3a781cc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json index 25298c1..109bae5 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json index 1bbb7f2..b024604 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json index 06552f8..9b911b3 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json index 4304134..5909a1b 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json index c7b6ae0..8ad94f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json index bf8765b..7393344 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json index ec9f822..f646ee1 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard-4", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json index 2cca1db..b743f89 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json @@ -3,6 +3,7 @@ "captcha": false, 
"infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json index aab0e36..cfbfba7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json index f7bb471..108e1d2 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json index 1383a03..ea94cce 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json index e481ee1..ca0713c 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json index ac2e874..337ef87 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json index af2f4e5..bb830f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json index 3b1b738..5888a54 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json index 3d429f2..5062e85 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json index b787dba..337b9b0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json index 47f68a2..24318f2 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json index 440f751..139c721 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": 
false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json index b037743..b0b0057 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json index 8ee64eb..a68e9cc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json index b5779df..4fdf4e3 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json index e779c2e..a8d2cae 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json index cdc6daa..fba156d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json index 7729a0e..2f0230c 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json index e582b27..32c7ae8 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json index 3338c46..0f97034 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json index b862585..c813c91 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json index 5c4fddd..449f12d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard-0", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json index d291dfe..8b30692 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json @@ -3,6 +3,7 @@ "captcha": false, 
"infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json index 60a1808..f17ab51 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json index 6f36220..b0c38b6 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json index 652950f..b080037 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json index 0c348e3..d0dc815 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json index 82a02d0..755956a 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json index c2470b0..93340b0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json index ff33c26..ca439fe 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json index 081a5f0..507bc53 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json index c7002b1..d353f98 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json index de7d590..fd794d8 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json index 3855578..cfb946a 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": 
false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json index 8fa54ca..51c8a84 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json index fb337e8..fe58b0e 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json index be7214f..9828634 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json index 30377e2..f64a0d1 100644 --- 
a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard-0", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json index ef290ec..18b3f5d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/test_parse_serp.py b/tests/test_parse_serp.py index 7dad1af..a61397c 100644 --- a/tests/test_parse_serp.py +++ b/tests/test_parse_serp.py @@ -174,3 +174,16 @@ def test_field_types(all_results): assert r["text"] is None or isinstance(r["text"], str) assert r["cite"] is None or isinstance(r["cite"], str) assert r["error"] is None or isinstance(r["error"], str) + + +def test_features_expose_main_layout(all_parsed_serps): + """Every SERP's features carries a str-or-None ``main_layout`` label, and + the witnessed fixture distribution (standard / standard-0 / standard-4) is + present -- pins the extractor->features wiring.""" + seen = set() + for serp in all_parsed_serps: + assert "main_layout" in serp["features"] + layout = serp["features"]["main_layout"] + assert layout is None or isinstance(layout, str) + seen.add(layout) + assert {"standard", "standard-0", "standard-4"} <= seen From 48193766ef8fec7f15066d46174bccc92fd9021a Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 02:57:34 +0000 Subject: [PATCH 5/8] Pin ExtractorMain layout branches; fix no-rso sec2 
duplication Establish characterization tests for every layout_label outcome before any ladder refactor -- the fixtures only witness standard/standard-0/standard-4, leaving 7 of 10 observable labels untested. Adds pins for: - get_layout routing truth table (standard, no-rso, left-bar, top-bars, and the top-bars-over-left-bar precedence) - standard-0/1/2/4 sub-type dispatch + the standard-3 empty fallback label - top-bars-divs / top-bars-children - left-bar (pins the current document-wide TzHB6b scope) and no-rso Fix: in extract_from_no_rso the page-level div.WvKfwe.a3spGf (sec2) section was appended inside the 'for div in sec1' loop, so its content was emitted once per sec1 block. Hoist it out of the loop so it appears once per page. Unwitnessed branch -> no snapshot churn. 315 passed (+16 pins), 66 snapshots unchanged, pyrefly clean. --- WebSearcher/extractors/extractor_main.py | 7 +- tests/test_extractor_main.py | 187 ++++++++++++++++++++++- 2 files changed, 189 insertions(+), 5 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index cf0c8b1..756b13c 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -382,9 +382,10 @@ def extract_from_no_rso(self, drop_tags: set | None = None) -> list: out.append(div) else: out.extend(n for n in div.css("div.g") if n.mem_id != div.mem_id) - sec2 = self.soup.css_first("div.WvKfwe.a3spGf") - if sec2 is not None and class_tokens(sec2) == ["WvKfwe", "a3spGf"]: - out.extend(sec2.iter(include_text=True)) + # Page-level trailing section -- appended once, not per sec1 div. 
+ sec2 = self.soup.css_first("div.WvKfwe.a3spGf") + if sec2 is not None and class_tokens(sec2) == ["WvKfwe", "a3spGf"]: + out.extend(sec2.iter(include_text=True)) return [ c for c in out diff --git a/tests/test_extractor_main.py b/tests/test_extractor_main.py index 002511f..174df1a 100644 --- a/tests/test_extractor_main.py +++ b/tests/test_extractor_main.py @@ -1,7 +1,9 @@ -"""Tests for ExtractorMain component validity filtering""" +"""Tests for ExtractorMain component validity filtering and layout dispatch.""" from WebSearcher import utils -from WebSearcher.extractors.extractor_main import ExtractorMain +from WebSearcher._slx import get_text +from WebSearcher.components import ComponentList +from WebSearcher.extractors.extractor_main import ExtractorMain, _find_all_with_class def comp(html: str): @@ -9,6 +11,31 @@ def comp(html: str): return utils.make_soup(f'
{html}
').css_first("div.wrap") +def _texts(nodes) -> list[str]: + return [(get_text(n) or "").strip() for n in nodes] + + +def _make_extractor(body: str, *, top_bars_css: str | None = None) -> ExtractorMain: + """Build an ExtractorMain over ``body`` with layout_divs primed as + ``get_layout`` would set them (rso + optional top-bars).""" + soup = utils.make_soup(f"{body}") + em = ExtractorMain(soup, ComponentList()) + em.layout_divs["rso"] = soup.css_first('div[id="rso"]') + em.layout_divs["left-bar"] = soup.css_first("div.OeVqAd") + rcnt = soup.css_first('div[id="rcnt"]') + em.layout_divs["top-bars"] = ( + _find_all_with_class(rcnt, top_bars_css) if (rcnt is not None and top_bars_css) else [] + ) + return em + + +def _layout_label(body: str) -> str | None: + soup = utils.make_soup(f"{body}") + em = ExtractorMain(soup, ComponentList()) + em.get_layout() + return em.layout_label + + # is_valid: bad-label / empty rejection ---------------------------------------- @@ -66,3 +93,159 @@ def test_is_valid_keeps_throttler_without_ulsxyf(): '
content
' ).css_first("div.other") assert ExtractorMain.is_valid(other) is True + + +# get_layout: routing-label truth table ---------------------------------------- +# Pins which of the four extractor keys get_layout selects for each (top-bars, +# left-bar, rso) combination. These are the registry keys; the chosen extractor +# may further mutate layout_label (see standard-*/top-bars-* tests below). + +_RCNT_TOPBAR = '
bar
' + + +def test_get_layout_label_standard_for_rso_only(): + assert _layout_label('
') == "standard" + + +def test_get_layout_label_no_rso_when_rso_absent(): + assert _layout_label("
nothing
") == "no-rso" + + +def test_get_layout_label_left_bar_takes_precedence_over_rso(): + # A left-bar marker wins even when rso is present -- rso is then never read. + assert _layout_label('
') == "left-bar" + + +def test_get_layout_label_top_bars_folds_into_standard_with_rso(): + assert _layout_label(f'{_RCNT_TOPBAR}
') == "standard" + + +def test_get_layout_label_top_bars_without_rso(): + assert _layout_label(_RCNT_TOPBAR) == "top-bars" + + +def test_get_layout_label_top_bars_takes_precedence_over_left_bar(): + # layouts dict order is top-bars, left-bar, standard, no-rso and the first + # truthy flag wins -- so a populated top-bar outranks a left-bar marker. + assert _layout_label(f'{_RCNT_TOPBAR}
') == "top-bars" + + +# extract_from_standard: standard-0/1/2/4 sub-type dispatch --------------------- +# standard-0 (overview) and standard-4 (AIRFARES) are witnessed by fixtures; +# standard-1 (Songs) and standard-2 (SportsStandings) are NOT, so they are +# pinned here before any ladder refactor. + + +def test_standard_0_overview_extracts_tzhb6b_children(): + em = _make_extractor( + '
' + '
ov 1
ov 2
' + "
" + ) + res = em.extract_from_standard() + assert em.layout_label == "standard-0" + assert _texts(res) == ["ov 1", "ov 2"] + + +def test_standard_1_songs_extracts_tab_children(): + em = _make_extractor( + '
' + '
tabwrap
' + '
song A
song B
' + "
" + ) + res = em.extract_from_standard() + assert em.layout_label == "standard-1" + assert _texts(res) == ["song A", "song B"] + + +def test_standard_2_sports_standings_extracts_tab_children(): + em = _make_extractor( + '
' + '
team 1
team 2
' + "
" + ) + res = em.extract_from_standard() + assert em.layout_label == "standard-2" + assert _texts(res) == ["team 1", "team 2"] + + +def test_standard_4_airfares_extracts_a6k0a_children(): + em = _make_extractor( + '
' + '
fare 1
fare 2
' + "
" + ) + res = em.extract_from_standard() + assert em.layout_label == "standard-4" + assert _texts(res) == ["fare 1", "fare 2"] + + +def test_standard_3_label_on_empty_rso_fallback(): + # No kp-wp-tab-* container matches and the generic extraction yields nothing, + # so the label settles on the standard-3 empty fallback. + em = _make_extractor('
') + res = em.extract_from_standard() + assert em.layout_label == "standard-3" + assert _texts(res) == [] + + +# extract_from_top_bar: top-bars-divs / top-bars-children ---------------------- + + +def test_top_bar_divs_when_rso_has_result_divs(): + body = ( + '
topbar content
' + '
result one
result two
' + ) + em = _make_extractor(body, top_bars_css="div.XqFnDf, div.M8OgIe") + res = em.extract_from_top_bar() + assert em.layout_label == "top-bars-divs" + assert _texts(res) == ["topbar content", "result one", "result two"] + + +def test_top_bar_children_when_rso_has_no_result_divs(): + body = ( + '
topbar content
' + '
plain child
' + ) + em = _make_extractor(body, top_bars_css="div.XqFnDf, div.M8OgIe") + res = em.extract_from_top_bar() + assert em.layout_label == "top-bars-children" + assert _texts(res) == ["topbar content", "plain child"] + + +# extract_from_left_bar / extract_from_no_rso ---------------------------------- + + +def test_left_bar_extracts_tzhb6b_document_wide(): + # NOTE pins current behavior: extraction is scoped to the whole document, not + # to the left-bar div, so it also captures div.TzHB6b inside rso. + em = _make_extractor( + '
leftbar marker
' + '
tz in rso
' + '
tz outside rso
' + ) + res = em.extract_from_left_bar() + assert _texts(res) == ["tz in rso", "tz outside rso"] + + +def test_no_rso_extracts_sec1_links(): + em = _make_extractor('
more1
') + res = em.extract_from_no_rso() + assert _texts(res) == ["more1"] + + +def test_no_rso_appends_sec2_once_per_page(): + # The trailing div.WvKfwe.a3spGf section is page-level, so its content must + # appear once regardless of how many sec1 blocks precede it (it previously + # re-appended once per sec1 div). + em = _make_extractor( + '
more1
' + '
more2
' + '
sec2 content
' + ) + res = em.extract_from_no_rso() + texts = _texts(res) + assert texts.count("sec2 content") == 1 + assert [t for t in texts if t != "sec2 content"] == ["more1", "more2"] From 18bdb64f79290796947300c7282d6279e5dd01c7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 03:33:01 +0000 Subject: [PATCH 6/8] Refactor standard-* layout ladder into a data-driven table The standard sub-layouts were a 4-branch if-ladder (~95 lines) whose branches differed only in data: detection container/selectors, extraction container, and which class token(s) to keep. The extract_top_divs preamble was copy-pasted 6x across the file. Collapse to a _StandardLayout table + two shared extraction shapes: - shape A (keep_tokens set): direct children with the first matching token, top_divs + main_divs returned unfiltered (standard-0, standard-4) - shape B (keep_tokens None): all text-inclusive children, then drop bad tags + empties from the combined column (standard-1, standard-2) Detection now iterates the same table instead of a parallel dict, so the detect/extract halves can no longer drift. Behavior-preserving: guarded by the characterization pins added last commit (standard-0/1/2/4 + standard-3 fallback). 315 passed, 66 snapshots unchanged, pyrefly clean. --- WebSearcher/extractors/extractor_main.py | 201 ++++++++++------------- 1 file changed, 86 insertions(+), 115 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 756b13c..a98a95a 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Any from selectolax.lexbor import LexborNode as Node @@ -8,6 +9,57 @@ log = Logger().start(__name__) +@dataclass(frozen=True) +class _StandardLayout: + """A ``standard-*`` sub-layout recipe. 
+ + ``detect_css`` locates the tab container and ``detect_sels`` is the set of + descendant selectors that must match for the layout to claim the page. + Extraction then reads ``extract_css`` (sometimes a different container than + detection) via one of two shapes: + + - ``keep_tokens`` set -> direct children carrying the first token that yields + any matches, concatenated with the top-divs *unfiltered*. + - ``keep_tokens`` is ``None`` -> all children (text-inclusive) concatenated + with the top-divs, then bad tags + empty nodes dropped. + """ + + detect_css: str + detect_sels: tuple[str, ...] + extract_css: str + keep_tokens: tuple[str, ...] | None + + +# Detection precedence follows insertion order (0, 1, 2, 4); standard-3 is the +# empty-rso fallback handled directly in ``extract_from_standard``. +_STANDARD_LAYOUTS: dict[str, _StandardLayout] = { + "standard-0": _StandardLayout( + detect_css='div[id="kp-wp-tab-overview"]', + detect_sels=("div.TzHB6b", "div.A6K0A"), + extract_css='div[id="kp-wp-tab-overview"]', + keep_tokens=("TzHB6b", "A6K0A"), + ), + "standard-1": _StandardLayout( + detect_css='div[id="kp-wp-tab-cont-Songs"][role="tabpanel"]', + detect_sels=("div",), + extract_css='div[id="kp-wp-tab-Songs"]', + keep_tokens=None, + ), + "standard-2": _StandardLayout( + detect_css='div[id="kp-wp-tab-SportsStandings"]', + detect_sels=("div",), + extract_css='div[id="kp-wp-tab-SportsStandings"]', + keep_tokens=None, + ), + "standard-4": _StandardLayout( + detect_css='div[id="kp-wp-tab-AIRFARES"]', + detect_sels=("div.A6K0A",), + extract_css='div[id="kp-wp-tab-AIRFARES"]', + keep_tokens=("A6K0A",), + ), +} + + def _filter_empty(nodes) -> list[Node]: return [n for n in nodes if n is not None and has_text(n)] @@ -127,36 +179,12 @@ def extract_from_standard(self, drop_tags: set | None = None) -> list: if rso_div is None: return [] drop_tags = drop_tags or {"script", "style", None} - standard_layouts = { - "standard-0": ( - 
rso_div.css_first('div[id="kp-wp-tab-overview"]'), - "div", - ["div.TzHB6b", "div.A6K0A"], - ), - "standard-1": ( - rso_div.css_first('div[id="kp-wp-tab-cont-Songs"][role="tabpanel"]'), - None, - None, - ), - "standard-2": ( - rso_div.css_first('div[id="kp-wp-tab-SportsStandings"]'), - None, - None, - ), - "standard-4": ( - rso_div.css_first('div[id="kp-wp-tab-AIRFARES"]'), - "div", - ["div.A6K0A"], - ), - } - for layout_name, (layout_div, check_tag, check_css_list) in standard_layouts.items(): - if layout_div is not None: - if check_tag and check_css_list: - for css in check_css_list: - if _find_all_with_class(layout_div, css, filter_empty=False): - return self._extract_from_standard_sub_type(layout_name) - elif _find_all_with_class(layout_div, "div", filter_empty=False): - return self._extract_from_standard_sub_type(layout_name) + for layout_name, spec in _STANDARD_LAYOUTS.items(): + container = rso_div.css_first(spec.detect_css) + if container is not None and any( + _find_all_with_class(container, sel, filter_empty=False) for sel in spec.detect_sels + ): + return self._extract_from_standard_sub_type(layout_name) top_divs = ( ExtractorMain.extract_top_divs( @@ -187,96 +215,39 @@ def _extract_from_standard_sub_type(self, sub_type: str = "") -> list: return [] log.debug(f"main_layout: {self.layout_label} (update)") - if self.layout_label == "standard-0": - column: list = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] + spec = _STANDARD_LAYOUTS[sub_type] + top_divs = ( + ExtractorMain.extract_top_divs( + self.layout_divs["top-bars"], rso=self.layout_divs["rso"] ) - tab_overview = rso_div.css_first('div[id="kp-wp-tab-overview"]') + or [] + ) + container = rso_div.css_first(spec.extract_css) + + if spec.keep_tokens is not None: + # Shape A: direct children carrying the first token that matches; + # result is top_divs + main_divs, returned unfiltered. 
main_divs: list[Node] = [] - if tab_overview is not None: - # recursive=False: direct children matching the class token. - main_divs = [ - c for c in tab_overview.iter(include_text=False) if "TzHB6b" in class_tokens(c) - ] - if not main_divs: - main_divs = [ - c - for c in tab_overview.iter(include_text=False) - if "A6K0A" in class_tokens(c) - ] - column.extend(top_divs) - column.extend(main_divs) + if container is not None: + children = list(container.iter(include_text=False)) + for token in spec.keep_tokens: + main_divs = [c for c in children if token in class_tokens(c)] + if main_divs: + break + column = top_divs + main_divs log.debug(f"main_components: {len(column):,}") return column - if self.layout_label == "standard-1": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - songs_div = rso_div.css_first('div[id="kp-wp-tab-Songs"]') - # bs4 ``list(songs_div.children)`` then filter ``.name not in {script,style}`` - # -- text nodes were dropped by the name filter (NavigableString.name is None - # which IS in {script,style,None} default... wait, not in {script,style}). - # The original kept text nodes here. To stay faithful, use include_text=True - # then filter by name; subsequent ``filter_empty_divs`` strips empties. 
- main_divs = list(songs_div.iter(include_text=True)) if songs_div is not None else [] - column.extend(top_divs) - column.extend(main_divs) - column = [ - d - for d in column - if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} - ] - column = _filter_empty(column) - return column - - if self.layout_label == "standard-2": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - sports_div = rso_div.css_first('div[id="kp-wp-tab-SportsStandings"]') - main_divs = list(sports_div.iter(include_text=True)) if sports_div is not None else [] - column.extend(top_divs) - column.extend(main_divs) - column = [ - d - for d in column - if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} - ] - column = _filter_empty(column) - return column - - if self.layout_label == "standard-4": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - tab_airfares = rso_div.css_first('div[id="kp-wp-tab-AIRFARES"]') - main_divs = ( - [c for c in tab_airfares.iter(include_text=False) if "A6K0A" in class_tokens(c)] - if tab_airfares is not None - else [] - ) - column.extend(top_divs) - column.extend(main_divs) - return column - - return [] + # Shape B: all children (text-inclusive) + top_divs, then drop bad tags + # and empty nodes from the combined column. 
+ main_divs = list(container.iter(include_text=True)) if container is not None else [] + column = top_divs + main_divs + column = [ + d + for d in column + if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} + ] + return _filter_empty(column) def extract_from_top_bar(self, drop_tags: set | None = None) -> list: drop_tags = drop_tags or {"script", "style", None} From 2e7030d726810cbc5a75033447bc2d39521ff4b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 03:42:03 +0000 Subject: [PATCH 7/8] Rename standard-* layout labels to observable names; add plan 030 Rename the opaque numeric standard-N labels to names derived from the observable kp-wp-tab-* container each detects (no guessing at content meaning): standard-0 -> standard-overview (kp-wp-tab-overview) standard-1 -> standard-songs (kp-wp-tab-Songs) standard-2 -> standard-sports-standings (kp-wp-tab-SportsStandings) standard-4 -> standard-airfares (kp-wp-tab-AIRFARES) standard-3 -> standard-fallback (empty-rso fallback, no tab) These surface in features.main_layout, so the rename updates 4 snapshots (the witnessed overview/airfares SERPs) and the pins. top-bars-divs/ children, left-bar, no-rso are already observable-derived and unchanged. Checked the codebase: the other 'standard' strings (ads.py, component_types .py, test_ads.py) are an unrelated ad/component sub_type, left untouched. Add docs/plans/030-main-layout-known-issues.md documenting the deferred layout issues (left-bar document-wide scope, standard-fallback dead body, left-bar/top-bars shadowing rso, layout_label dual role, dead defensive code) -- all blocked on capturing witnessed fixtures. 315 passed, 66 snapshots, pyrefly clean. 
--- WebSearcher/extractors/extractor_main.py | 15 ++- WebSearcher/models/features.py | 2 +- docs/plans/030-main-layout-known-issues.md | 125 ++++++++++++++++++ .../test_parse_serp[30926d7c7ae9].json | 2 +- .../test_parse_serp[6e401e618433].json | 2 +- .../test_parse_serp[b186024ec98a].json | 2 +- .../test_parse_serp[f6fae1c9a96e].json | 2 +- tests/test_extractor_main.py | 29 ++-- tests/test_parse_serp.py | 6 +- 9 files changed, 155 insertions(+), 30 deletions(-) create mode 100644 docs/plans/030-main-layout-known-issues.md diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index a98a95a..3e243e8 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -30,28 +30,29 @@ class _StandardLayout: keep_tokens: tuple[str, ...] | None -# Detection precedence follows insertion order (0, 1, 2, 4); standard-3 is the -# empty-rso fallback handled directly in ``extract_from_standard``. +# Names mirror the observable ``kp-wp-tab-*`` container each sub-layout detects; +# detection precedence follows insertion order. ``standard-fallback`` is the +# empty-rso fallback handled directly in ``extract_from_standard`` (no tab). 
_STANDARD_LAYOUTS: dict[str, _StandardLayout] = { - "standard-0": _StandardLayout( + "standard-overview": _StandardLayout( detect_css='div[id="kp-wp-tab-overview"]', detect_sels=("div.TzHB6b", "div.A6K0A"), extract_css='div[id="kp-wp-tab-overview"]', keep_tokens=("TzHB6b", "A6K0A"), ), - "standard-1": _StandardLayout( + "standard-songs": _StandardLayout( detect_css='div[id="kp-wp-tab-cont-Songs"][role="tabpanel"]', detect_sels=("div",), extract_css='div[id="kp-wp-tab-Songs"]', keep_tokens=None, ), - "standard-2": _StandardLayout( + "standard-sports-standings": _StandardLayout( detect_css='div[id="kp-wp-tab-SportsStandings"]', detect_sels=("div",), extract_css='div[id="kp-wp-tab-SportsStandings"]', keep_tokens=None, ), - "standard-4": _StandardLayout( + "standard-airfares": _StandardLayout( detect_css='div[id="kp-wp-tab-AIRFARES"]', detect_sels=("div.A6K0A",), extract_css='div[id="kp-wp-tab-AIRFARES"]', @@ -196,7 +197,7 @@ def extract_from_standard(self, drop_tags: set | None = None) -> list: col = top_divs + col col = [c for c in col if ExtractorMain.is_valid(c)] if not col: - self.layout_label = "standard-3" + self.layout_label = "standard-fallback" log.debug(f"main_layout: {self.layout_label} (update)") divs = _find_all_with_class(rso_div, 'div[id="kp-wp-tab-overview"]', filter_empty=False) col = [] diff --git a/WebSearcher/models/features.py b/WebSearcher/models/features.py index e4bd54f..490a9b4 100644 --- a/WebSearcher/models/features.py +++ b/WebSearcher/models/features.py @@ -14,5 +14,5 @@ class SERPFeatures(BaseModel): overlay_precise_location: bool = False captcha: bool = False # Main-section layout label assigned during extraction, e.g. "standard", - # "standard-0". None when no layout was detected. + # "standard-overview". None when no layout was detected. 
main_layout: str | None = None diff --git a/docs/plans/030-main-layout-known-issues.md b/docs/plans/030-main-layout-known-issues.md new file mode 100644 index 0000000..c130d3b --- /dev/null +++ b/docs/plans/030-main-layout-known-issues.md @@ -0,0 +1,125 @@ +--- +status: draft +branch: claude/simplify-websearcher-pkg +created: 2026-05-31T00:00:00-07:00 +completed: +pr: +--- + +# Main-Section Layout: Known Issues & Follow-ups + +## Status + +Draft / notes. Captures the layout-dispatch issues surfaced while making +`ExtractorMain.layout_label` observable (added as `features.main_layout`), +pinning every layout branch, refactoring the `standard-*` ladder into a +data-driven table, and renaming the opaque `standard-N` labels. + +What already landed on `claude/simplify-websearcher-pkg`: + +- `features.main_layout` exposes the main-section layout label in parsed output. +- Characterization pins in `tests/test_extractor_main.py` cover every observable + `layout_label` outcome (the SERP fixtures only witness `standard`, + `standard-overview`, `standard-airfares`). +- `extract_from_standard` / `_extract_from_standard_sub_type` collapsed into a + `_StandardLayout` table + two extraction shapes. +- `no-rso` `sec2` duplication bug fixed (was appended once per `sec1` div). +- `standard-N` renamed to observable, `kp-wp-tab-*`-derived names + (`standard-overview`/`-songs`/`-sports-standings`/`-airfares`) and the + bolted-on fallback renamed `standard-fallback`. + +This doc tracks what was deliberately **not** changed, because the fixes touch +extraction branches no fixture exercises and would be guesses without a real +witnessed SERP. + +## Layout label inventory + +Routing keys (`get_layout` -> `layout_extractors`): `standard`, `top-bars`, +`left-bar`, `no-rso`. 
Final observable values after extractor mutation: + +| Value | Witnessed in fixtures | +|---|---| +| `standard` | yes (62) | +| `standard-overview` | yes (3) | +| `standard-airfares` | yes (1) | +| `standard-songs` | no (pinned synthetically) | +| `standard-sports-standings` | no (pinned synthetically) | +| `standard-fallback` | no (pinned synthetically) | +| `top-bars-divs` | no (pinned synthetically) | +| `top-bars-children` | no (pinned synthetically) | +| `left-bar` | no (pinned synthetically) | +| `no-rso` | no (extraction path pinned synthetically) | + +`top-bars` is never a final value (always mutates to `-divs`/`-children`). +`None` is unreachable from `get_layout`. + +## Open issues + +### 1. `left-bar` extraction is scoped to the whole document + +`extract_from_left_bar` returns `subtree_css(self.soup, "div.TzHB6b")` over the +entire page, ignoring `layout_divs["left-bar"]` (the `div.OeVqAd` it detected +on). `div.TzHB6b` is **not** a left-bar marker -- it is a generic knowledge-panel +container also keyed on by `standard-overview`, the `standard-fallback` path, and +the RHS knowledge-panel detector (`extractor_rhs.py`). So detection and +extraction target unrelated things, and extraction reaches across sections. + +- **Faithful port, not a regression.** Git history shows the bs4 original was + literally `self.soup.find_all("div", {"class": "TzHB6b"})` -- document-wide + from the start. +- **Likely benign today:** the registry is 1:1 (in a `left-bar` layout, this is + the only main extractor that runs, so there is no second pass to duplicate + against), and RHS panels are `remove()`d from the tree before main extraction, + so the cross-section bleed is largely closed in the full pipeline. The unit + pin (`test_left_bar_extracts_tzhb6b_document_wide`) documents the raw + document-wide reach in isolation. +- **Why deferred:** zero `left-bar` fixtures. Narrowing the scope to + `subtree_css(left_bar_div, "div.TzHB6b")` could just as easily break a real + left-bar SERP as fix one, with no witness either way.
+- **To resolve:** capture a real `left-bar` SERP, snapshot it, then decide + scope as a one-line, fully-witnessed change. + +### 2. `standard-fallback` extraction body is effectively dead + +The fallback re-targets the same `kp-wp-tab-overview` + `TzHB6b`/`A6K0A` that +`standard-overview` already gates on and checks first. Any content the fallback +could find, `standard-overview` already caught -- so the label only persists on +an essentially empty `rso`, with its extraction body unreachable (consistent +with 0 fixtures and the pin asserting an empty result). + +- **To resolve:** confirm against a witnessed empty-`rso` SERP, then either drop + the dead extraction body (keep the label as a pure "nothing extracted" signal) + or delete the branch entirely if the label has no consumer value. + +### 3. `left-bar` / `top-bars` shadow a populated `rso` + +In `get_layout`, when a `left-bar` (or populated `top-bars` with no rso) is +present, the label resolves away from `standard` and `rso` is never read. A page +carrying both a left-bar/top-bars marker **and** a populated `rso` would drop the +`rso` results. Whether that combination occurs in the wild is unknown (no +fixture). The routing truth table is pinned in `test_extractor_main.py` +(`test_get_layout_label_*`), so the current precedence is at least locked. + +### 4. `layout_label` has a dual role (routing key + result descriptor) + +`layout_label` is both the dispatch key (`get_layout` -> `layout_extractors`) +and the mutated result descriptor (`standard-*`, `top-bars-*`). The mapping is +not 1:1 (`standard` -> 6 outcomes, `top-bars` -> 2), and the registry is keyed +on the pre-mutation value while output reports the post-mutation one. The table +refactor reduced the fragility on the `standard-*` side, but the split between a +routing key and an output label remains implicit. 
A future cleanup could split +these into two fields (a `routing` enum and an observable `main_layout`) if the +output value ever needs a stable, documented set. + +### 5. Dead defensive code in `_main_column` + +Both `if self.layout_label is None: raise ValueError(...)` and the +`KeyError -> ValueError("no extractor...")` are unreachable: `get_layout` always +assigns one of the four registered routing keys. Low priority; safe to drop if +touching this method for another reason. + +## Dependencies + +Issues 1-3 are blocked on capturing witnessed fixtures for the unexercised +layouts (`left-bar`, empty-`rso`, left-bar-or-top-bars + populated `rso`). Issues +4-5 are pure cleanups with no blocker, but low value on their own. diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json index 2bef2b3..b32dc50 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json @@ -3,7 +3,7 @@ "captcha": true, "infinity_scroll": true, "language": "en", - "main_layout": "standard-0", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json index f646ee1..513712d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json @@ -3,7 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", - "main_layout": "standard-4", + "main_layout": "standard-airfares", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json 
b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json index 449f12d..b1dfe43 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json @@ -3,7 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", - "main_layout": "standard-0", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json index f64a0d1..a31d7ee 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json @@ -3,7 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", - "main_layout": "standard-0", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/test_extractor_main.py b/tests/test_extractor_main.py index 174df1a..555825c 100644 --- a/tests/test_extractor_main.py +++ b/tests/test_extractor_main.py @@ -130,24 +130,23 @@ def test_get_layout_label_top_bars_takes_precedence_over_left_bar(): assert _layout_label(f'{_RCNT_TOPBAR}
') == "top-bars" -# extract_from_standard: standard-0/1/2/4 sub-type dispatch --------------------- -# standard-0 (overview) and standard-4 (AIRFARES) are witnessed by fixtures; -# standard-1 (Songs) and standard-2 (SportsStandings) are NOT, so they are -# pinned here before any ladder refactor. +# extract_from_standard: standard-* sub-type dispatch -------------------------- +# standard-overview and standard-airfares are witnessed by fixtures; +# standard-songs and standard-sports-standings are NOT, so they are pinned here. -def test_standard_0_overview_extracts_tzhb6b_children(): +def test_standard_overview_extracts_tzhb6b_children(): em = _make_extractor( '
' '
ov 1
ov 2
' "
" ) res = em.extract_from_standard() - assert em.layout_label == "standard-0" + assert em.layout_label == "standard-overview" assert _texts(res) == ["ov 1", "ov 2"] -def test_standard_1_songs_extracts_tab_children(): +def test_standard_songs_extracts_tab_children(): em = _make_extractor( '
' '
tabwrap
' @@ -155,38 +154,38 @@ def test_standard_1_songs_extracts_tab_children(): "
" ) res = em.extract_from_standard() - assert em.layout_label == "standard-1" + assert em.layout_label == "standard-songs" assert _texts(res) == ["song A", "song B"] -def test_standard_2_sports_standings_extracts_tab_children(): +def test_standard_sports_standings_extracts_tab_children(): em = _make_extractor( '
' '
team 1
team 2
' "
" ) res = em.extract_from_standard() - assert em.layout_label == "standard-2" + assert em.layout_label == "standard-sports-standings" assert _texts(res) == ["team 1", "team 2"] -def test_standard_4_airfares_extracts_a6k0a_children(): +def test_standard_airfares_extracts_a6k0a_children(): em = _make_extractor( '
' '
fare 1
fare 2
' "
" ) res = em.extract_from_standard() - assert em.layout_label == "standard-4" + assert em.layout_label == "standard-airfares" assert _texts(res) == ["fare 1", "fare 2"] -def test_standard_3_label_on_empty_rso_fallback(): +def test_standard_fallback_label_on_empty_rso(): # No kp-wp-tab-* container matches and the generic extraction yields nothing, - # so the label settles on the standard-3 empty fallback. + # so the label settles on the standard-fallback empty path. em = _make_extractor('
') res = em.extract_from_standard() - assert em.layout_label == "standard-3" + assert em.layout_label == "standard-fallback" assert _texts(res) == [] diff --git a/tests/test_parse_serp.py b/tests/test_parse_serp.py index a61397c..d123a6c 100644 --- a/tests/test_parse_serp.py +++ b/tests/test_parse_serp.py @@ -178,12 +178,12 @@ def test_field_types(all_results): def test_features_expose_main_layout(all_parsed_serps): """Every SERP's features carries a str-or-None ``main_layout`` label, and - the witnessed fixture distribution (standard / standard-0 / standard-4) is - present -- pins the extractor->features wiring.""" + the witnessed fixture distribution (standard / standard-overview / + standard-airfares) is present -- pins the extractor->features wiring.""" seen = set() for serp in all_parsed_serps: assert "main_layout" in serp["features"] layout = serp["features"]["main_layout"] assert layout is None or isinstance(layout, str) seen.add(layout) - assert {"standard", "standard-0", "standard-4"} <= seen + assert {"standard", "standard-overview", "standard-airfares"} <= seen From d5aad5028f4934a01bd6a81325a2eff19d64d72c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 06:19:20 +0000 Subject: [PATCH 8/8] Guard _extract_from_standard_sub_type against unknown sub_type Address PR review: _STANDARD_LAYOUTS[sub_type] would KeyError if the helper were ever called with an unknown/empty sub_type (the signature still defaults to ""). Use .get() and return [] for a missing recipe, restoring the pre-refactor "unknown subtype -> empty result" behavior. Unreachable from the current caller, but makes the helper robust. Pinned with a test. 
--- WebSearcher/extractors/extractor_main.py | 5 ++++- tests/test_extractor_main.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 3e243e8..7044cf3 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -216,7 +216,10 @@ def _extract_from_standard_sub_type(self, sub_type: str = "") -> list: return [] log.debug(f"main_layout: {self.layout_label} (update)") - spec = _STANDARD_LAYOUTS[sub_type] + spec = _STANDARD_LAYOUTS.get(sub_type) + if spec is None: + # Unknown/empty sub_type -> no recipe; preserve "empty result". + return [] top_divs = ( ExtractorMain.extract_top_divs( self.layout_divs["top-bars"], rso=self.layout_divs["rso"] diff --git a/tests/test_extractor_main.py b/tests/test_extractor_main.py index 555825c..79de178 100644 --- a/tests/test_extractor_main.py +++ b/tests/test_extractor_main.py @@ -189,6 +189,14 @@ def test_standard_fallback_label_on_empty_rso(): assert _texts(res) == [] +def test_extract_from_standard_sub_type_unknown_returns_empty(): + # A sub_type with no recipe in _STANDARD_LAYOUTS yields an empty result + # rather than raising (the method default is still ""). + em = _make_extractor('
x
') + assert em._extract_from_standard_sub_type("not-a-layout") == [] + assert em._extract_from_standard_sub_type() == [] + + # extract_from_top_bar: top-bars-divs / top-bars-children ----------------------