Skip to content
Merged
2 changes: 1 addition & 1 deletion WebSearcher/classifiers/footer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ def classify(cmpt: Node) -> str:

cmpt_type = "unknown"
for classifier in classifier_list:
cmpt_type = classifier(node)
if cmpt_type != "unknown":
break
cmpt_type = classifier(node)

# Fall back to main classifier
if cmpt_type == "unknown":
Expand Down
24 changes: 4 additions & 20 deletions WebSearcher/classifiers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from selectolax.lexbor import LexborNode as Node

from .. import logger
from .._slx import class_tokens, get_text
from .._slx import _iter_text_fragments, class_tokens, get_text
from ..component_types import header_text_to_type

log = logger.Logger().start(__name__)
Expand Down Expand Up @@ -186,10 +186,10 @@ def general(cmpt) -> str:
"""Classify general components."""
node: Node = cmpt
node_id = node.mem_id
cls = class_tokens(node)
# bs4 distinguished "class" present vs absent via ``"class" in cmpt.attrs``
# -- preserve that distinction explicitly.
if "class" in node.attributes:
cls = class_tokens(node)
conditions = {
"format-01": cls == ["g"],
"format-02": ("g" in cls) and ("Ww4FFb" in cls),
Expand Down Expand Up @@ -268,30 +268,14 @@ def knowledge_box(cmpt) -> str:
condition["locations"] = node.css_first("div.zd2Jbb") is not None
condition["events"] = node.css_first("g-card.URhAHe") is not None
condition["jobs"] = node.css_first("g-card.cvoI5e") is not None
# bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank
# text fragment in the subtree. Use the _slx walker indirectly via
# iter_text_fragments-style filter.
# bs4 ``next(iter(cmpt.stripped_strings), None)`` -- first non-blank text
# fragment in the subtree; ``_iter_text_fragments`` replicates stripped_strings.
first_text: str | None = None
for s in (get_text(node) or "").splitlines():
s2 = s.strip()
if s2:
first_text = s2
break
if first_text is None:
# fallback: pull first non-whitespace fragment from text walker
text = get_text(node) or ""
first_text = text.strip().split()[0] if text.strip() else None
# Simpler & more faithful: replicate stripped_strings exactly via the
# _slx iter_text_fragments walker.
from .._slx import _iter_text_fragments

for raw in _iter_text_fragments(node):
stripped = raw.strip()
if stripped:
first_text = stripped
break
else:
first_text = None
if first_text is not None:
condition["covid_alert"] = first_text == "COVID-19 alert"
for condition_type, conditions in condition.items():
Expand Down
20 changes: 11 additions & 9 deletions WebSearcher/component_parsers/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ def _next_sibling_with_text(node: Node) -> Node | None:

# Patterns for the aria-label form, e.g. "Rated 4.5 out of 5" and
# "(1,234) user reviews".
_ARIA_RATING_RE = re.compile(r"Rated\s+(\d+(?:\.\d+)?)\s+out of\s+(\d+)")
_ARIA_REVIEWS_RE = re.compile(r"\(([\d,]+)\)\s*user reviews?")
# A plain decimal that float() accepts: "5", "5.", "4.5" or ".5". At least one
# digit is required so that "" and "." are not classified as numeric and then
# rejected by float().
_RATING_NUMERIC_RE = re.compile(r"^(?:\d+[.]?\d*|[.]\d+)$")
# Suffix that marks a vote/review count (" vote", " votes", " review", " reviews").
_RATING_VOTES_RE = re.compile(r" vote[s]?| review[s]?")
# Marker for a single attributed review.
_RATING_REVIEW_BY_RE = re.compile(r"Review by")
# Separator between the fields of a product snippet.
_PRODUCT_SPLIT_RE = re.compile(r"-|·")


def parse_rating_aria_label(aria_label: str) -> dict:
Expand All @@ -202,26 +206,24 @@ def parse_rating_aria_label(aria_label: str) -> dict:

def parse_ratings(text) -> dict:
    """Extract a rating and, when present, a review count from text fragments.

    Args:
        text: Sequence of strings. ``text[0]`` carries the rating, optionally
            prefixed with ``"Rating: "``; ``text[1]``, if present, carries the
            vote/review information.

    Returns:
        A dict with ``"rating"`` (a float when the value is a plain decimal,
        otherwise the raw string) and, when it can be derived from
        ``text[1]``, ``"reviews"`` as an int.

    Raises:
        IndexError: If ``text`` is empty.
        ValueError: If a vote/review count is present but not an integer.
    """
    text = [t.strip() for t in text]
    rating = text[0].split("Rating: ")[-1]

    # float() is stricter than a loose digit pattern; keep the raw string
    # rather than raising when the conversion is rejected.
    try:
        value = float(rating) if _RATING_NUMERIC_RE.match(rating) else rating
    except ValueError:
        value = rating
    details: dict = {"rating": value}

    if len(text) > 1:
        if _RATING_VOTES_RE.search(text[1]):
            reviews = _RATING_VOTES_RE.split(text[1])[0]
            reviews = reviews.replace(",", "")[1:]  # [1:] drops unicode char
            details["reviews"] = int(reviews)
        elif _RATING_REVIEW_BY_RE.search(text[1]):
            details["reviews"] = 1

    return details


def parse_product(text: str) -> dict:
    """Split a product snippet into price and, when present, stock details.

    The text is split on ``-`` or ``·``. Each returned value is the stripped
    part with its first character dropped.

    Args:
        text: Raw product text.

    Returns:
        ``{"price": ...}`` when there is no separator, otherwise
        ``{"price": ..., "stock": ...}`` built from the first two parts.
    """
    parts = _PRODUCT_SPLIT_RE.split(text)
    # NOTE(review): the dropped first character is presumably a currency
    # symbol or invisible marker -- confirm against real snippets.
    price = parts[0].strip()[1:]
    if len(parts) == 1:
        return {"price": price}
    return {"price": price, "stock": parts[1].strip()[1:]}
Expand Down
8 changes: 2 additions & 6 deletions WebSearcher/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,13 @@ def __init__(
self.type = type
self.cmpt_rank = cmpt_rank
self.result_list: list[dict] = []
self.result_counter = 0

def __str__(self) -> str:
    """Return the string form of the instance's attribute dict."""
    return str(vars(self))

def to_dict(self) -> dict:
    """Return the instance attributes as a dict.

    A shallow copy is returned so that callers adding or removing keys on the
    result do not alter the instance's own attributes.
    """
    return dict(vars(self))

def get_metadata(self, key_filter=("section", "cmpt_rank")) -> dict:
    """Return the subset of ``to_dict()`` whose keys are in ``key_filter``.

    Args:
        key_filter: Container of attribute names to keep. The default is a
            tuple rather than a list so the default cannot be mutated across
            calls.

    Returns:
        A new dict holding only the selected key/value pairs.
    """
    return {k: v for k, v in self.to_dict().items() if k in key_filter}

def classify_component(self, classify_type_func: Callable | None = None):
"""Classify the component type"""
if classify_type_func:
Expand Down Expand Up @@ -150,7 +146,7 @@ def __iter__(self):

def add_component(self, elem, section="unknown", type="unknown", cmpt_rank=None):
"""Add a component to the list of components"""
cmpt_rank = self.cmpt_rank_counter if not cmpt_rank else cmpt_rank
cmpt_rank = self.cmpt_rank_counter if cmpt_rank is None else cmpt_rank
component = Component(elem, section, type, cmpt_rank)

self.components.append(component)
Expand Down Expand Up @@ -227,4 +223,4 @@ def export_component_results(self):
return results

def to_records(self):
    """Return one dict per stored component, as produced by its ``to_dict()``."""
    # The loop variable must not reuse the ``Component`` class name.
    return [cmpt.to_dict() for cmpt in self.components]
Loading