diff --git a/WebSearcher/classifiers/footer.py b/WebSearcher/classifiers/footer.py index 14d03ba..8a470a6 100644 --- a/WebSearcher/classifiers/footer.py +++ b/WebSearcher/classifiers/footer.py @@ -28,9 +28,9 @@ def classify(cmpt: Node) -> str: cmpt_type = "unknown" for classifier in classifier_list: + cmpt_type = classifier(node) if cmpt_type != "unknown": break - cmpt_type = classifier(node) # Fall back to main classifier if cmpt_type == "unknown": diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 4134b08..42f39b5 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -3,7 +3,7 @@ from selectolax.lexbor import LexborNode as Node from .. import logger -from .._slx import class_tokens, get_text +from .._slx import _iter_text_fragments, class_tokens, get_text from ..component_types import header_text_to_type log = logger.Logger().start(__name__) @@ -186,10 +186,10 @@ def general(cmpt) -> str: """Classify general components.""" node: Node = cmpt node_id = node.mem_id - cls = class_tokens(node) # bs4 distinguished "class" present vs absent via ``"class" in cmpt.attrs`` # -- preserve that distinction explicitly. if "class" in node.attributes: + cls = class_tokens(node) conditions = { "format-01": cls == ["g"], "format-02": ("g" in cls) and ("Ww4FFb" in cls), @@ -268,30 +268,14 @@ def knowledge_box(cmpt) -> str: condition["locations"] = node.css_first("div.zd2Jbb") is not None condition["events"] = node.css_first("g-card.URhAHe") is not None condition["jobs"] = node.css_first("g-card.cvoI5e") is not None - # bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank - # text fragment in the subtree. Use the _slx walker indirectly via - # iter_text_fragments-style filter. + # bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank text + # fragment in the subtree; ``_iter_text_fragments`` replicates stripped_strings. first_text: str | None = None - for s in (get_text(node) or "").splitlines(): - s2 = s.strip() - if s2: - first_text = s2 - break - if first_text is None: - # fallback: pull first non-whitespace fragment from text walker - text = get_text(node) or "" - first_text = text.strip().split()[0] if text.strip() else None - # Simpler & more faithful: replicate stripped_strings exactly via the - # _slx iter_text_fragments walker. - from .._slx import _iter_text_fragments - for raw in _iter_text_fragments(node): stripped = raw.strip() if stripped: first_text = stripped break - else: - first_text = None if first_text is not None: condition["covid_alert"] = first_text == "COVID-19 alert" for condition_type, conditions in condition.items(): diff --git a/WebSearcher/component_parsers/general.py b/WebSearcher/component_parsers/general.py index 971f445..66bf1c7 100644 --- a/WebSearcher/component_parsers/general.py +++ b/WebSearcher/component_parsers/general.py @@ -183,6 +183,10 @@ def _next_sibling_with_text(node: Node) -> Node | None: _ARIA_RATING_RE = re.compile(r"Rated\s+(\d+(?:\.\d+)?)\s+out of\s+(\d+)") _ARIA_REVIEWS_RE = re.compile(r"\(([\d,]+)\)\s*user reviews?") +_RATING_NUMERIC_RE = re.compile(r"^\d*[.]?\d*$") +_RATING_VOTES_RE = re.compile(r" vote[s]?| review[s]?") +_RATING_REVIEW_BY_RE = re.compile("Review by") +_PRODUCT_SPLIT_RE = re.compile("-|·") def parse_rating_aria_label(aria_label: str) -> dict: @@ -202,26 +206,24 @@ def parse_rating_aria_label(aria_label: str) -> dict: def parse_ratings(text) -> dict: text = [t.strip() for t in text] - numeric = re.compile(r"^\d*[.]?\d*$") rating = re.split("Rating: ", text[0])[-1] - details: dict = {"rating": float(rating)} if numeric.match(rating) else {"rating": rating} + details: dict = ( + {"rating": float(rating)} if _RATING_NUMERIC_RE.match(rating) else {"rating": rating} + ) if len(text) > 1: - str_match_0 = re.compile(" vote[s]?| review[s]?") - str_match_1 = re.compile("Review by") - if str_match_0.search(text[1]): - reviews = re.split(str_match_0, text[1])[0] + if _RATING_VOTES_RE.search(text[1]): + reviews = re.split(_RATING_VOTES_RE, text[1])[0] reviews = reviews.replace(",", "")[1:] # [1:] drops unicode char details["reviews"] = int(reviews) - elif str_match_1.search(text[1]): + elif _RATING_REVIEW_BY_RE.search(text[1]): details["reviews"] = 1 return details def parse_product(text: str) -> dict: - split_match = re.compile("-|·") - parts = re.split(split_match, text) + parts = re.split(_PRODUCT_SPLIT_RE, text) if len(parts) == 1: return {"price": parts[0].strip()[1:]} return {"price": parts[0].strip()[1:], "stock": parts[1].strip()[1:]} diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 1fd5c8a..2f7c8ef 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -40,7 +40,6 @@ def __init__( self.type = type self.cmpt_rank = cmpt_rank self.result_list: list[dict] = [] - self.result_counter = 0 def __str__(self) -> str: return str(vars(self)) @@ -48,9 +47,6 @@ def __str__(self) -> str: def to_dict(self) -> dict: return self.__dict__ - def get_metadata(self, key_filter=["section", "cmpt_rank"]) -> dict: - return {k: v for k, v in self.to_dict().items() if k in key_filter} - def classify_component(self, classify_type_func: Callable | None = None): """Classify the component type""" if classify_type_func: @@ -150,7 +146,7 @@ def __iter__(self): def add_component(self, elem, section="unknown", type="unknown", cmpt_rank=None): """Add a component to the list of components""" - cmpt_rank = self.cmpt_rank_counter if not cmpt_rank else cmpt_rank + cmpt_rank = self.cmpt_rank_counter if cmpt_rank is None else cmpt_rank component = Component(elem, section, type, cmpt_rank) self.components.append(component) @@ -227,4 +223,4 @@ def export_component_results(self): return results def to_records(self): - return [Component.to_dict() for Component in self.components] + return [cmpt.to_dict() for cmpt in self.components] diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index cf0c8b1..7044cf3 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Any from selectolax.lexbor import LexborNode as Node @@ -8,6 +9,58 @@ log = Logger().start(__name__) +@dataclass(frozen=True) +class _StandardLayout: + """A ``standard-*`` sub-layout recipe. + + ``detect_css`` locates the tab container and ``detect_sels`` is the set of + descendant selectors that must match for the layout to claim the page. + Extraction then reads ``extract_css`` (sometimes a different container than + detection) via one of two shapes: + + - ``keep_tokens`` set -> direct children carrying the first token that yields + any matches, concatenated with the top-divs *unfiltered*. + - ``keep_tokens`` is ``None`` -> all children (text-inclusive) concatenated + with the top-divs, then bad tags + empty nodes dropped. + """ + + detect_css: str + detect_sels: tuple[str, ...] + extract_css: str + keep_tokens: tuple[str, ...] | None + + +# Names mirror the observable ``kp-wp-tab-*`` container each sub-layout detects; +# detection precedence follows insertion order. ``standard-fallback`` is the +# empty-rso fallback handled directly in ``extract_from_standard`` (no tab). +_STANDARD_LAYOUTS: dict[str, _StandardLayout] = { + "standard-overview": _StandardLayout( + detect_css='div[id="kp-wp-tab-overview"]', + detect_sels=("div.TzHB6b", "div.A6K0A"), + extract_css='div[id="kp-wp-tab-overview"]', + keep_tokens=("TzHB6b", "A6K0A"), + ), + "standard-songs": _StandardLayout( + detect_css='div[id="kp-wp-tab-cont-Songs"][role="tabpanel"]', + detect_sels=("div",), + extract_css='div[id="kp-wp-tab-Songs"]', + keep_tokens=None, + ), + "standard-sports-standings": _StandardLayout( + detect_css='div[id="kp-wp-tab-SportsStandings"]', + detect_sels=("div",), + extract_css='div[id="kp-wp-tab-SportsStandings"]', + keep_tokens=None, + ), + "standard-airfares": _StandardLayout( + detect_css='div[id="kp-wp-tab-AIRFARES"]', + detect_sels=("div.A6K0A",), + extract_css='div[id="kp-wp-tab-AIRFARES"]', + keep_tokens=("A6K0A",), + ), +} + + def _filter_empty(nodes) -> list[Node]: return [n for n in nodes if n is not None and has_text(n)] @@ -127,36 +180,12 @@ def extract_from_standard(self, drop_tags: set | None = None) -> list: if rso_div is None: return [] drop_tags = drop_tags or {"script", "style", None} - standard_layouts = { - "standard-0": ( - rso_div.css_first('div[id="kp-wp-tab-overview"]'), - "div", - ["div.TzHB6b", "div.A6K0A"], - ), - "standard-1": ( - rso_div.css_first('div[id="kp-wp-tab-cont-Songs"][role="tabpanel"]'), - None, - None, - ), - "standard-2": ( - rso_div.css_first('div[id="kp-wp-tab-SportsStandings"]'), - None, - None, - ), - "standard-4": ( - rso_div.css_first('div[id="kp-wp-tab-AIRFARES"]'), - "div", - ["div.A6K0A"], - ), - } - for layout_name, (layout_div, check_tag, check_css_list) in standard_layouts.items(): - if layout_div is not None: - if check_tag and check_css_list: - for css in check_css_list: - if _find_all_with_class(layout_div, css, filter_empty=False): - return self._extract_from_standard_sub_type(layout_name) - elif _find_all_with_class(layout_div, "div", filter_empty=False): - return self._extract_from_standard_sub_type(layout_name) + for layout_name, spec in _STANDARD_LAYOUTS.items(): + container = rso_div.css_first(spec.detect_css) + if container is not None and any( + _find_all_with_class(container, sel, filter_empty=False) for sel in spec.detect_sels + ): + return self._extract_from_standard_sub_type(layout_name) top_divs = ( ExtractorMain.extract_top_divs( @@ -168,7 +197,7 @@ def extract_from_standard(self, drop_tags: set | None = None) -> list: col = top_divs + col col = [c for c in col if ExtractorMain.is_valid(c)] if not col: - self.layout_label = "standard-3" + self.layout_label = "standard-fallback" log.debug(f"main_layout: {self.layout_label} (update)") divs = _find_all_with_class(rso_div, 'div[id="kp-wp-tab-overview"]', filter_empty=False) col = [] @@ -187,96 +216,42 @@ def _extract_from_standard_sub_type(self, sub_type: str = "") -> list: return [] log.debug(f"main_layout: {self.layout_label} (update)") - if self.layout_label == "standard-0": - column: list = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] + spec = _STANDARD_LAYOUTS.get(sub_type) + if spec is None: + # Unknown/empty sub_type -> no recipe; preserve "empty result". + return [] + top_divs = ( + ExtractorMain.extract_top_divs( + self.layout_divs["top-bars"], rso=self.layout_divs["rso"] ) - tab_overview = rso_div.css_first('div[id="kp-wp-tab-overview"]') + or [] + ) + container = rso_div.css_first(spec.extract_css) + + if spec.keep_tokens is not None: + # Shape A: direct children carrying the first token that matches; + # result is top_divs + main_divs, returned unfiltered. main_divs: list[Node] = [] - if tab_overview is not None: - # recursive=False: direct children matching the class token. - main_divs = [ - c for c in tab_overview.iter(include_text=False) if "TzHB6b" in class_tokens(c) - ] - if not main_divs: - main_divs = [ - c - for c in tab_overview.iter(include_text=False) - if "A6K0A" in class_tokens(c) - ] - column.extend(top_divs) - column.extend(main_divs) + if container is not None: + children = list(container.iter(include_text=False)) + for token in spec.keep_tokens: + main_divs = [c for c in children if token in class_tokens(c)] + if main_divs: + break + column = top_divs + main_divs log.debug(f"main_components: {len(column):,}") return column - if self.layout_label == "standard-1": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - songs_div = rso_div.css_first('div[id="kp-wp-tab-Songs"]') - # bs4 ``list(songs_div.children)`` then filter ``.name not in {script,style}`` - # -- text nodes were dropped by the name filter (NavigableString.name is None - # which IS in {script,style,None} default... wait, not in {script,style}). - # The original kept text nodes here. To stay faithful, use include_text=True - # then filter by name; subsequent ``filter_empty_divs`` strips empties. - main_divs = list(songs_div.iter(include_text=True)) if songs_div is not None else [] - column.extend(top_divs) - column.extend(main_divs) - column = [ - d - for d in column - if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} - ] - column = _filter_empty(column) - return column - - if self.layout_label == "standard-2": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - sports_div = rso_div.css_first('div[id="kp-wp-tab-SportsStandings"]') - main_divs = list(sports_div.iter(include_text=True)) if sports_div is not None else [] - column.extend(top_divs) - column.extend(main_divs) - column = [ - d - for d in column - if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} - ] - column = _filter_empty(column) - return column - - if self.layout_label == "standard-4": - column = [] - top_divs = ( - ExtractorMain.extract_top_divs( - self.layout_divs["top-bars"], rso=self.layout_divs["rso"] - ) - or [] - ) - tab_airfares = rso_div.css_first('div[id="kp-wp-tab-AIRFARES"]') - main_divs = ( - [c for c in tab_airfares.iter(include_text=False) if "A6K0A" in class_tokens(c)] - if tab_airfares is not None - else [] - ) - column.extend(top_divs) - column.extend(main_divs) - return column - - return [] + # Shape B: all children (text-inclusive) + top_divs, then drop bad tags + # and empty nodes from the combined column. + main_divs = list(container.iter(include_text=True)) if container is not None else [] + column = top_divs + main_divs + column = [ + d + for d in column + if d.tag and not d.tag.startswith("-") and d.tag not in {"script", "style"} + ] + return _filter_empty(column) def extract_from_top_bar(self, drop_tags: set | None = None) -> list: drop_tags = drop_tags or {"script", "style", None} @@ -382,9 +357,10 @@ def extract_from_no_rso(self, drop_tags: set | None = None) -> list: out.append(div) else: out.extend(n for n in div.css("div.g") if n.mem_id != div.mem_id) - sec2 = self.soup.css_first("div.WvKfwe.a3spGf") - if sec2 is not None and class_tokens(sec2) == ["WvKfwe", "a3spGf"]: - out.extend(sec2.iter(include_text=True)) + # Page-level trailing section -- appended once, not per sec1 div. + sec2 = self.soup.css_first("div.WvKfwe.a3spGf") + if sec2 is not None and class_tokens(sec2) == ["WvKfwe", "a3spGf"]: + out.extend(sec2.iter(include_text=True)) return [ c for c in out diff --git a/WebSearcher/models/features.py b/WebSearcher/models/features.py index b39b7fa..490a9b4 100644 --- a/WebSearcher/models/features.py +++ b/WebSearcher/models/features.py @@ -13,3 +13,6 @@ class SERPFeatures(BaseModel): infinity_scroll: bool = False overlay_precise_location: bool = False captcha: bool = False + # Main-section layout label assigned during extraction, e.g. "standard", + # "standard-overview". None when no layout was detected. + main_layout: str | None = None diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index d66200b..e6a3050 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -39,9 +39,12 @@ def parse_serp(serp: str | Node) -> dict: finally: raw_serp_html.reset(token) + # Forward raw HTML (when available) + soup so feature extraction takes the + # regex path and reuses the already-parsed soup for shared probes. The main + # layout label is internal to extraction, so surface it on the features here. + features = FeatureExtractor.extract_features(serp, soup=soup) + features.main_layout = extractor.main_handler.layout_label return { - # Forward raw HTML (when available) + soup so feature extraction takes - # the regex path and reuses the already-parsed soup for shared probes. - "features": FeatureExtractor.extract_features(serp, soup=soup).model_dump(), + "features": features.model_dump(), "results": results, } diff --git a/WebSearcher/utils.py b/WebSearcher/utils.py index be0b2a5..e304967 100644 --- a/WebSearcher/utils.py +++ b/WebSearcher/utils.py @@ -129,13 +129,9 @@ def get_domain(url: str | None) -> str: if not url: return "" domain = tldextract.extract(url) - without_subdomain = ".".join([domain.domain, domain.suffix]) - with_subdomain = ".".join([domain.subdomain, domain.domain, domain.suffix]) - if domain.subdomain: - domain_str = without_subdomain if domain.subdomain == "www" else with_subdomain - else: - domain_str = without_subdomain - return domain_str + if domain.subdomain and domain.subdomain != "www": + return ".".join([domain.subdomain, domain.domain, domain.suffix]) + return ".".join([domain.domain, domain.suffix]) # Sessions --------------------------------------------------------------------- diff --git a/docs/plans/030-main-layout-known-issues.md b/docs/plans/030-main-layout-known-issues.md new file mode 100644 index 0000000..c130d3b --- /dev/null +++ b/docs/plans/030-main-layout-known-issues.md @@ -0,0 +1,125 @@ +--- +status: draft +branch: claude/simplify-websearcher-pkg +created: 2026-05-31T00:00:00-07:00 +completed: +pr: +--- + +# Main-Section Layout: Known Issues & Follow-ups + +## Status + +Draft / notes. Captures the layout-dispatch issues surfaced while making +`ExtractorMain.layout_label` observable (added as `features.main_layout`), +pinning every layout branch, refactoring the `standard-*` ladder into a +data-driven table, and renaming the opaque `standard-N` labels. + +What already landed on `claude/simplify-websearcher-pkg`: + +- `features.main_layout` exposes the main-section layout label in parsed output. +- Characterization pins in `tests/test_extractor_main.py` cover every observable + `layout_label` outcome (the SERP fixtures only witness `standard`, + `standard-overview`, `standard-airfares`). +- `extract_from_standard` / `_extract_from_standard_sub_type` collapsed into a + `_StandardLayout` table + two extraction shapes. +- `no-rso` `sec2` duplication bug fixed (was appended once per `sec1` div). +- `standard-N` renamed to observable, `kp-wp-tab-*`-derived names + (`standard-overview`/`-songs`/`-sports-standings`/`-airfares`) and the + bolted-on fallback renamed `standard-fallback`. + +This doc tracks what was deliberately **not** changed, because the fixes touch +extraction branches no fixture exercises and would be guesses without a real +witnessed SERP. + +## Layout label inventory + +Routing keys (`get_layout` -> `layout_extractors`): `standard`, `top-bars`, +`left-bar`, `no-rso`. Final observable values after extractor mutation: + +| Value | Witnessed in fixtures | +|---|---| +| `standard` | yes (62) | +| `standard-overview` | yes (3) | +| `standard-airfares` | yes (1) | +| `standard-songs` | no (pinned synthetically) | +| `standard-sports-standings` | no (pinned synthetically) | +| `standard-fallback` | no (pinned synthetically) | +| `top-bars-divs` | no | +| `top-bars-children` | no | +| `left-bar` | no | +| `no-rso` | no (extraction path pinned synthetically) | + +`top-bars` is never a final value (always mutates to `-divs`/`-children`). +`None` is unreachable from `get_layout`. + +## Open issues + +### 1. `left-bar` extraction is scoped to the whole document + +`extract_from_left_bar` returns `subtree_css(self.soup, "div.TzHB6b")` over the +entire page, ignoring `layout_divs["left-bar"]` (the `div.OeVqAd` it detected +on). `div.TzHB6b` is **not** a left-bar marker -- it is a generic knowledge-panel +container also keyed on by `standard-overview`, the `standard-fallback` path, and +the RHS knowledge-panel detector (`extractor_rhs.py`). So detection and +extraction target unrelated things, and extraction reaches across sections. + +- **Faithful port, not a regression.** Git history shows the bs4 original was + literally `self.soup.find_all("div", {"class": "TzHB6b"})` -- document-wide + from the start. +- **Likely benign today:** the registry is 1:1 (in a `left-bar` layout, this is + the only main extractor that runs, so there is no second pass to duplicate + against), and RHS panels are `remove()`d from the tree before main extraction, + so the cross-section bleed is largely closed in the full pipeline. The unit + pin (`test_left_bar_extracts_tzhb6b_document_wide`) documents the raw + document-wide reach in isolation. +- **Why deferred:** zero `left-bar` fixtures. Narrowing the scope to + `subtree_css(left_bar_div, "div.TzHB6b")` could just as easily break a real + left-bar SERP as fix one, with no witness either way. +- **To resolve:** capture a real `left-bar` SERP, snapshot it, then decide + scope as a one-line, fully-witnessed change. + +### 2. `standard-fallback` extraction body is effectively dead + +The fallback re-targets the same `kp-wp-tab-overview` + `TzHB6b`/`A6K0A` that +`standard-overview` already gates on and checks first. Any content the fallback +could find, `standard-overview` already caught -- so the label only persists on +an essentially empty `rso`, with its extraction body unreachable (consistent +with 0 fixtures and the pin asserting an empty result). + +- **To resolve:** confirm against a witnessed empty-`rso` SERP, then either drop + the dead extraction body (keep the label as a pure "nothing extracted" signal) + or delete the branch entirely if the label has no consumer value. + +### 3. `left-bar` / `top-bars` shadow a populated `rso` + +In `get_layout`, when a `left-bar` (or populated `top-bars` with no rso) is +present, the label resolves away from `standard` and `rso` is never read. A page +carrying both a left-bar/top-bars marker **and** a populated `rso` would drop the +`rso` results. Whether that combination occurs in the wild is unknown (no +fixture). The routing truth table is pinned in `test_extractor_main.py` +(`test_get_layout_label_*`), so the current precedence is at least locked. + +### 4. `layout_label` has a dual role (routing key + result descriptor) + +`layout_label` is both the dispatch key (`get_layout` -> `layout_extractors`) +and the mutated result descriptor (`standard-*`, `top-bars-*`). The mapping is +not 1:1 (`standard` -> 6 outcomes, `top-bars` -> 2), and the registry is keyed +on the pre-mutation value while output reports the post-mutation one. The table +refactor reduced the fragility on the `standard-*` side, but the split between a +routing key and an output label remains implicit. A future cleanup could split +these into two fields (a `routing` enum and an observable `main_layout`) if the +output value ever needs a stable, documented set. + +### 5. Dead defensive code in `_main_column` + +Both `if self.layout_label is None: raise ValueError(...)` and the +`KeyError -> ValueError("no extractor...")` are unreachable: `get_layout` always +assigns one of the four registered routing keys. Low priority; safe to drop if +touching this method for another reason. + +## Dependencies + +Issues 1-3 are blocked on capturing witnessed fixtures for the unexercised +layouts (`left-bar`, empty-`rso`, left-bar-or-top-bars + populated `rso`). Issues +4-5 are pure cleanups with no blocker, but low value on their own. diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json index db631d1..0433592 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[01f85d1329ba].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json index 720335a..c13b978 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[032572e185d3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json index e4cd6ec..ddd1019 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0410241ce1e2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json index 9a00a24..75d1bd0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0d3fc3b49b76].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json index 6332f59..70ce070 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[0ed311025efc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json index 112e573..2c914f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[130eba186e94].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json index 301bd1f..0f21fd5 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[18eccfe8454e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json index a056de9..bb280e7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[2c0aa0bbcd0c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json index 258ba6a..1979fc1 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[2d1b05a046b2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json index 398f17d..4f8cf8a 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[305b53af69be].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json index 192cb40..b32dc50 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[30926d7c7ae9].json @@ -3,6 +3,7 @@ "captcha": true, "infinity_scroll": true, "language": "en", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json index 2b7c49a..9e98f20 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[30c5d6bdb650].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json index 30cff2f..c9fc124 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[39617f527744].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json index 3716b9f..1d18c6d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c03a4a2cb7c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json index 0aaa079..6ba005d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3c09a0f0c92f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json index 7c826c6..785d98d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[3f5efb1dc358].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json index f8fc1bc..486d7ed 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[45b6e019bfa2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json index 3a4ff55..78904bc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[4c8d8d2f226c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json index 92705b1..a1399da 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[53940e35cc92].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json index 69009dc..3a781cc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[56cbcf8cd4dc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json index 25298c1..109bae5 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[56f2eab63e9d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json index 1bbb7f2..b024604 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[5898b04fb534].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json index 06552f8..9b911b3 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6978d0cd767d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json index 4304134..5909a1b 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6aa70651b0cd].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json index c7b6ae0..8ad94f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6dc5bc34ff55].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json index bf8765b..7393344 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e206db14899].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json index ec9f822..513712d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[6e401e618433].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard-airfares", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json index 2cca1db..b743f89 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7049404a2dd6].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json index aab0e36..cfbfba7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7333536d2911].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json index f7bb471..108e1d2 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7ad9715f3597].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json index 1383a03..ea94cce 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7b89c00120e3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json index e481ee1..ca0713c 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[7d76d3a83ebc].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json index ac2e874..337ef87 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[811a27f92284].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json index af2f4e5..bb830f7 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[82e35954f552].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json index 3b1b738..5888a54 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[83b17a6a7750].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json index 3d429f2..5062e85 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8d1b75b71e7f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json index b787dba..337b9b0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8e820f7b024f].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json index 47f68a2..24318f2 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[8f98fa9c0bef].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json index 440f751..139c721 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9101d12ab778].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json index b037743..b0b0057 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[923a428c1c22].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json index 8ee64eb..a68e9cc 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[97404b7b7c61].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json index b5779df..4fdf4e3 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[984065877aad].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json index e779c2e..a8d2cae 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9a7e39d95bf0].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json index cdc6daa..fba156d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[9ed1baa7715d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json index 7729a0e..2f0230c 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c881e003e2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json index e582b27..32c7ae8 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[a6c8fe7fe769].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json index 3338c46..0f97034 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[aa594f199c3d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json index b862585..c813c91 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b15c5131b06c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json index 5c4fddd..b1dfe43 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b186024ec98a].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": true, "language": "en", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json index d291dfe..8b30692 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[b2e1777bf0f2].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json index 60a1808..f17ab51 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[be99c971b8f7].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json index 6f36220..b0c38b6 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[c48f8aa3f6da].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json index 652950f..b080037 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[c9ab650f5bda].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json index 0c348e3..d0dc815 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[cad43c3268a8].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json index 82a02d0..755956a 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[ce37f114963e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json index c2470b0..93340b0 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1855fa9cd1c].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json index ff33c26..ca439fe 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d1ac0c4abb10].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json index 081a5f0..507bc53 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[d920789249af].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json index c7002b1..d353f98 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[da9b4fce9ab0].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json index de7d590..fd794d8 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[dc5861b33dda].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json index 3855578..cfb946a 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[e71a1cb4cd70].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json index 8fa54ca..51c8a84 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[e828d00dc1b3].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json index fb337e8..fe58b0e 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[eab14aa4ff5d].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json index be7214f..9828634 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[f006c9318116].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json index 30377e2..a31d7ee 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[f6fae1c9a96e].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard-overview", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json b/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json index ef290ec..18b3f5d 100644 --- a/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json +++ b/tests/__snapshots__/test_parse_serp/test_parse_serp[faa9c7c889db].json @@ -3,6 +3,7 @@ "captcha": false, "infinity_scroll": false, "language": "en", + "main_layout": "standard", "notice_no_results": false, "notice_server_error": false, "notice_shortened_query": false, diff --git a/tests/test_components.py b/tests/test_components.py index 9d85e29..2e3bca4 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -7,7 +7,7 @@ """ from WebSearcher import utils -from WebSearcher.components import Component +from WebSearcher.components import Component, ComponentList def comp(inner: str): @@ -45,3 +45,19 @@ def test_select_parser_returns_callable_for_unknown_and_registered(): assert callable(unknown_c.select_parser()) general_c = Component(comp("x"), section="main", type="general") assert callable(general_c.select_parser()) + + +def test_add_component_auto_increments_rank_from_zero(): + cl = ComponentList() + cl.add_component(comp("a"), section="main") + cl.add_component(comp("b"), section="main") + assert [c.cmpt_rank for c in cl.components] == [0, 1] + + +def test_add_component_honors_explicit_zero_rank(): + # cmpt_rank=None is the "auto-assign" sentinel; an explicit 0 must be kept, + # not treated as falsy-missing (it would otherwise pick up the counter, 1). + cl = ComponentList() + cl.add_component(comp("a"), section="main") # auto rank 0, counter -> 1 + cl.add_component(comp("b"), section="main", cmpt_rank=0) + assert cl.components[1].cmpt_rank == 0 diff --git a/tests/test_extractor_main.py b/tests/test_extractor_main.py index 002511f..79de178 100644 --- a/tests/test_extractor_main.py +++ b/tests/test_extractor_main.py @@ -1,7 +1,9 @@ -"""Tests for ExtractorMain component validity filtering""" +"""Tests for ExtractorMain component validity filtering and layout dispatch.""" from WebSearcher import utils -from WebSearcher.extractors.extractor_main import ExtractorMain +from WebSearcher._slx import get_text +from WebSearcher.components import ComponentList +from WebSearcher.extractors.extractor_main import ExtractorMain, _find_all_with_class def comp(html: str): @@ -9,6 +11,31 @@ def comp(html: str): return utils.make_soup(f'