From 764cc2c500d0a0bab58dcb6ea7d91cab10d2e006 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 18:23:46 +0000 Subject: [PATCH 1/4] Initial plan From b515f73214c7f718fac2a310ce4208126f3ca2d7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 18:27:36 +0000 Subject: [PATCH 2/4] feat: cluster related stories before AI briefing generation Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716 Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- src/wazzup/feeds.py | 120 +++++++++++++++++++++++++++- src/wazzup/pipeline.py | 3 +- src/wazzup/scoring.py | 4 +- tests/fixtures/story-clustering.xml | 32 ++++++++ tests/test_ai.py | 24 ++++++ tests/test_feeds.py | 55 ++++++++++++- tests/test_pipeline.py | 81 +++++++++++++++++++ 7 files changed, 314 insertions(+), 5 deletions(-) create mode 100644 tests/fixtures/story-clustering.xml diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py index 1f9746b..6151655 100644 --- a/src/wazzup/feeds.py +++ b/src/wazzup/feeds.py @@ -7,7 +7,7 @@ import urllib.request import xml.etree.ElementTree as ET from dataclasses import replace -from datetime import UTC, datetime +from datetime import UTC, datetime, timedelta from email.utils import parsedate_to_datetime from .models import ContentItem, SourceConfig, SourceStatus @@ -28,6 +28,33 @@ TAG_RE = re.compile(r"<[^>]+>") WHITESPACE_RE = re.compile(r"\s+") NON_WORD_RE = re.compile(r"[^\w\s-]", re.UNICODE) +MIN_STORY_SHARED_KEYWORDS = 2 +MAX_STORY_TIME_DELTA = timedelta(hours=18) +STORY_STOPWORDS = { + "about", + "after", + "analysis", + "announces", + "attack", + "attacks", + "breaking", + "commentary", + "cyber", + "for", + "from", + "incident", + "inside", + "latest", + "new", + "news", + "report", + "reports", + "security", + "story", + "the", + "threat", + "update", +} def utc_now() -> datetime: @@ -259,3 +286,94 @@ def deduplicate(items: list[ContentItem]) -> list[ContentItem]: ) winners.append(replace(winner, related_items=related_items) if related_items else winner) return sorted(winners, key=lambda item: item.published_at, reverse=True) + + +def _keyword_tokens(value: str) -> set[str]: + text = NON_WORD_RE.sub(" ", clean_text(value).lower()) + text = WHITESPACE_RE.sub(" ", text).strip() + return { + token + for token in text.split(" ") + if token and (len(token) >= 4 or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS + } + + +def _story_keywords(item: ContentItem) -> set[str]: + return _keyword_tokens(item.title) | _keyword_tokens(item.summary) | _keyword_tokens(" ".join(item.tags)) + + +def _canonical_path_tokens(item: ContentItem) -> set[str]: + parsed = urllib.parse.urlsplit(item.canonical_url) + return {token for token in parsed.path.lower().split("/") if token and token != "index"} + + +def _story_anchor_tokens(tokens: set[str]) -> set[str]: + return { + token + for token in tokens + if any(char.isdigit() for char in token) or token.startswith(("cve", "apt", "kb")) or len(token) >= 8 + } + + +def _parse_content_timestamp(value: str) -> datetime: + normalized = value.replace("Z", "+00:00") + parsed = datetime.fromisoformat(normalized) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=UTC) + return parsed.astimezone(UTC) + + +def _story_related(left: ContentItem, right: ContentItem) -> bool: + if abs(_parse_content_timestamp(left.published_at) - _parse_content_timestamp(right.published_at)) > MAX_STORY_TIME_DELTA: + return False + left_title = normalize_title(left.title) + right_title = normalize_title(right.title) + if left_title and left_title == right_title: + return True + left_keywords = _story_keywords(left) + right_keywords = _story_keywords(right) + if not left_keywords or not right_keywords: + return False + shared_keywords = left_keywords & right_keywords + if len(shared_keywords) < MIN_STORY_SHARED_KEYWORDS: + return False + if not (_story_anchor_tokens(shared_keywords) | (_canonical_path_tokens(left) & _canonical_path_tokens(right))): + return False + overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords))) + return overlap >= 0.5 + + +def _flatten_group_items(item: ContentItem) -> list[ContentItem]: + return [replace(item, related_items=()), *(replace(related, related_items=()) for related in item.related_items)] + + +def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]: + groups: list[list[ContentItem]] = [] + for item in items: + matching_indexes = [ + index for index, group_items in enumerate(groups) if any(_story_related(item, candidate) for candidate in group_items) + ] + if not matching_indexes: + groups.append([item]) + continue + first_index = matching_indexes[0] + groups[first_index].append(item) + for index in reversed(matching_indexes[1:]): + groups[first_index].extend(groups[index]) + del groups[index] + + winners: list[ContentItem] = [] + for group_items in groups: + expanded = [entry for grouped in group_items for entry in _flatten_group_items(grouped)] + deduped_by_id: dict[str, ContentItem] = {item.id: item for item in expanded} + flattened = list(deduped_by_id.values()) + winner = max(flattened, key=item_priority) + related_items = tuple( + sorted( + (replace(item, related_items=()) for item in flattened if item.id != winner.id), + key=item_priority, + reverse=True, + ) + ) + winners.append(replace(winner, related_items=related_items) if related_items else winner) + return sorted(winners, key=lambda item: item.published_at, reverse=True) diff --git a/src/wazzup/pipeline.py b/src/wazzup/pipeline.py index 50fd877..ad15603 100644 --- a/src/wazzup/pipeline.py +++ b/src/wazzup/pipeline.py @@ -10,7 +10,7 @@ from .ai import SummaryRequest, provider_from_env from .config import load_app_config, load_sources -from .feeds import deduplicate, fetch_and_parse, isoformat, parse_feed, utc_now +from .feeds import cluster_related_stories, deduplicate, fetch_and_parse, isoformat, parse_feed, utc_now from .models import BriefingKind, ContentItem, ScoredItem, SourceStatus from .publisher import briefing_path, publish_outputs from .scoring import parse_iso, score_items @@ -188,6 +188,7 @@ def generate(argv: Sequence[str] | None = None) -> dict: if kind == "hourly": content_window_start, content_window_end = rolling_day_window(now, app_config.timezone) window_items = filter_items_to_window(items, content_window_start, content_window_end) + window_items = cluster_related_stories(window_items) scored = score_items(window_items, sources, app_config, now) if kind == "hourly": scored = prioritize_hourly_new_items(scored, now) diff --git a/src/wazzup/scoring.py b/src/wazzup/scoring.py index 81dbb23..bed816e 100644 --- a/src/wazzup/scoring.py +++ b/src/wazzup/scoring.py @@ -2,7 +2,7 @@ from datetime import UTC, datetime -from .feeds import canonicalize_url, stable_hash +from .feeds import stable_hash from .models import AppConfig, ContentItem, ScoredItem, SourceConfig @@ -64,7 +64,7 @@ def score_items( score += 6.0 reasons.append("priority threat intelligence source") - duplicate_group_id = f"dup-{stable_hash(canonicalize_url(item.canonical_url))}" + duplicate_group_id = f"dup-{stable_hash(*sorted([item.id, *(related.id for related in item.related_items)]))}" scored.append( ScoredItem( item=item, diff --git a/tests/fixtures/story-clustering.xml b/tests/fixtures/story-clustering.xml new file mode 100644 index 0000000..917bf99 --- /dev/null +++ b/tests/fixtures/story-clustering.xml @@ -0,0 +1,32 @@ + + + + Story clustering fixtures + https://example.com + Fixture feed for clustering tests + + Acme VPN CVE-2026-4242 exploited in active campaign + https://example.com/security/acme-vpn-cve-2026-4242 + story-1 + Tue, 06 May 2026 09:00:00 GMT + Researchers report active exploitation of Acme VPN CVE-2026-4242 with emergency guidance. + security + + + Emergency patch for Acme VPN after CVE-2026-4242 exploitation + https://example.net/alerts/acme-vpn-cve-2026-4242-patch + story-2 + Tue, 06 May 2026 10:00:00 GMT + Vendors ship fixes while defenders track the same Acme VPN CVE-2026-4242 campaign. + vulnerability + + + Acme VPN releases regional maintenance update for managed gateways + https://example.org/releases/acme-vpn-maintenance-update + story-3 + Tue, 06 May 2026 11:00:00 GMT + Acme VPN announced a maintenance rollout for gateway stability in Europe. + security + + + diff --git a/tests/test_ai.py b/tests/test_ai.py index a7ca0b4..d58e065 100644 --- a/tests/test_ai.py +++ b/tests/test_ai.py @@ -179,6 +179,30 @@ def test_prompt_allows_synthesized_bullets_for_related_items(self) -> None: self.assertIn("same story", style_guide) self.assertIn("cite every source item ID", style_guide) + def test_prompt_payload_includes_related_items_for_grouped_story_context(self) -> None: + source = load_sources("config/sources.yml")[0] + item = parse_feed(source, Path("tests/fixtures/microsoft-security-blog.xml").read_bytes())[0] + related = replace(item, id="item-related") + scored = score_items( + [replace(item, related_items=(related,))], + [source], + load_app_config("config/interests.yml"), + datetime(2026, 5, 6, tzinfo=UTC), + ) + payload = build_prompt_payload( + SummaryRequest( + kind="hourly", + window_start="2026-05-06T20:00:00Z", + window_end="2026-05-06T21:00:00Z", + generated_at="2026-05-06T21:00:00Z", + timezone="Europe/Amsterdam", + summary_language="en", + items=scored, + ) + ) + + self.assertEqual("item-related", payload["items"][0]["relatedItems"][0]["id"]) + def test_prompt_style_guide_requires_english_translation(self) -> None: payload = build_prompt_payload( SummaryRequest( diff --git a/tests/test_feeds.py b/tests/test_feeds.py index fe42cf3..06d2edf 100644 --- a/tests/test_feeds.py +++ b/tests/test_feeds.py @@ -5,7 +5,7 @@ from pathlib import Path from wazzup.config import load_sources -from wazzup.feeds import canonicalize_url, deduplicate, parse_feed +from wazzup.feeds import canonicalize_url, cluster_related_stories, deduplicate, parse_feed class FeedTests(unittest.TestCase): @@ -62,6 +62,59 @@ def test_deduplicate_preserves_related_sources_for_same_story(self) -> None: self.assertEqual(["item-related-source"], [item.id for item in deduped[0].related_items]) self.assertEqual("related-source", deduped[0].related_items[0].source_id) + def test_deduplicate_groups_fixture_duplicates(self) -> None: + source = load_sources("config/sources.yml")[0] + fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes()) + duplicate = replace( + fixture_items[0], + id="item-duplicate-source", + source_id="duplicate-source", + source_name="Duplicate Source", + source_tag="Duplicate", + canonical_url="https://duplicate.example/acme-vpn-cve-2026-4242", + url="https://duplicate.example/acme-vpn-cve-2026-4242", + raw_ref="duplicate-entry", + ) + + deduped = deduplicate([fixture_items[0], duplicate]) + + self.assertEqual(1, len(deduped)) + self.assertEqual(["item-duplicate-source"], [item.id for item in deduped[0].related_items]) + + def test_cluster_related_stories_groups_near_duplicates(self) -> None: + source = load_sources("config/sources.yml")[0] + fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes()) + first_story = fixture_items[0] + near_duplicate = replace( + fixture_items[1], + id="item-near-duplicate-source", + source_id="near-duplicate-source", + source_name="Near Duplicate Source", + source_tag="Near Duplicate", + ) + + clustered = cluster_related_stories([first_story, near_duplicate]) + + self.assertEqual(1, len(clustered)) + self.assertEqual(["item-near-duplicate-source"], [item.id for item in clustered[0].related_items]) + + def test_cluster_related_stories_keeps_same_topic_different_story_separate(self) -> None: + source = load_sources("config/sources.yml")[0] + fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes()) + first_story = fixture_items[0] + different_story = replace( + fixture_items[2], + id="item-different-story-source", + source_id="different-story-source", + source_name="Different Story Source", + source_tag="Different Story", + ) + + clustered = cluster_related_stories([first_story, different_story]) + + self.assertEqual(2, len(clustered)) + self.assertTrue(all(not item.related_items for item in clustered)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 89f6c05..fe0f6f3 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -48,6 +48,28 @@ def scored_item(item_id: str, published_at: str, score: float) -> ScoredItem: ) +def content_item(item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str) -> ContentItem: + return ContentItem( + schema_version=1, + id=item_id, + source_id=source_id, + source_name=f"{source_id} name", + source_tag=source_id.upper(), + source_type="rss", + title=title, + url=canonical_url, + canonical_url=canonical_url, + published_at=published_at, + discovered_at=published_at, + authors=[], + tags=["security"], + language="en", + summary=summary, + content_hash=f"hash-{item_id}", + raw_ref=item_id, + ) + + class PipelineTests(unittest.TestCase): def test_hourly_selection_prioritizes_new_articles(self) -> None: now = datetime(2026, 5, 6, 15, 42, tzinfo=UTC) @@ -310,6 +332,65 @@ def test_exclude_already_featured_hourly_items_drops_fully_featured_related_grou self.assertEqual(["item-2"], [item.item.id for item in fresh_items]) + def test_generate_clusters_related_story_items_before_ai_summary(self) -> None: + previous_provider = os.environ.get("AI_PROVIDER") + os.environ["AI_PROVIDER"] = "fake" + try: + with tempfile.TemporaryDirectory() as tmp_dir: + public_dir = Path(tmp_dir) + fixed_now = datetime(2026, 5, 6, 10, 30, tzinfo=UTC) + grouped_items = [ + content_item( + "item-primary", + "source-a", + "Acme VPN CVE-2026-4242 exploited in active campaign", + "https://example.com/security/acme-vpn-cve-2026-4242", + "2026-05-06T09:20:00Z", + "Researchers report active exploitation of Acme VPN CVE-2026-4242.", + ), + content_item( + "item-related", + "source-b", + "Emergency patch for Acme VPN after CVE-2026-4242 exploitation", + "https://example.net/alerts/acme-vpn-cve-2026-4242-patch", + "2026-05-06T09:45:00Z", + "Vendors ship fixes for the same Acme VPN CVE-2026-4242 campaign.", + ), + ] + with ( + patch("wazzup.pipeline.utc_now", return_value=fixed_now), + patch( + "wazzup.pipeline.collect_items", + return_value=( + grouped_items, + [], + [], + ), + ), + ): + latest = generate( + [ + "--public-dir", + str(public_dir), + "--force-briefing", + "hourly", + "--max-items", + "5", + ] + ) + briefing = json.loads((public_dir / latest["latestBriefingUrl"]).read_text(encoding="utf-8")) + articles = json.loads((public_dir / latest["latestArticlesUrl"]).read_text(encoding="utf-8")) + + self.assertEqual(1, len(articles["items"])) + self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"])) + self.assertEqual(1, len(briefing["sections"][0]["bullets"])) + self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"])) + finally: + if previous_provider is None: + os.environ.pop("AI_PROVIDER", None) + else: + os.environ["AI_PROVIDER"] = previous_provider + if __name__ == "__main__": unittest.main() From 01eb8afcc1df3941ddc9a18b1796cd802ec9075d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 18:28:53 +0000 Subject: [PATCH 3/4] refactor: tighten story clustering readability and tests Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716 Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- src/wazzup/feeds.py | 31 ++++++++++++++++++++++--------- tests/test_pipeline.py | 2 ++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py index 6151655..5287b39 100644 --- a/src/wazzup/feeds.py +++ b/src/wazzup/feeds.py @@ -323,8 +323,16 @@ def _parse_content_timestamp(value: str) -> datetime: return parsed.astimezone(UTC) -def _story_related(left: ContentItem, right: ContentItem) -> bool: - if abs(_parse_content_timestamp(left.published_at) - _parse_content_timestamp(right.published_at)) > MAX_STORY_TIME_DELTA: +def _story_related(left: ContentItem, right: ContentItem, published_at_by_item_id: dict[str, datetime] | None = None) -> bool: + published_left = ( + published_at_by_item_id[left.id] if published_at_by_item_id and left.id in published_at_by_item_id else _parse_content_timestamp(left.published_at) + ) + published_right = ( + published_at_by_item_id[right.id] + if published_at_by_item_id and right.id in published_at_by_item_id + else _parse_content_timestamp(right.published_at) + ) + if abs(published_left - published_right) > MAX_STORY_TIME_DELTA: return False left_title = normalize_title(left.title) right_title = normalize_title(right.title) @@ -337,7 +345,9 @@ def _story_related(left: ContentItem, right: ContentItem) -> bool: shared_keywords = left_keywords & right_keywords if len(shared_keywords) < MIN_STORY_SHARED_KEYWORDS: return False - if not (_story_anchor_tokens(shared_keywords) | (_canonical_path_tokens(left) & _canonical_path_tokens(right))): + has_anchor_tokens = bool(_story_anchor_tokens(shared_keywords)) + has_shared_path_tokens = bool(_canonical_path_tokens(left) & _canonical_path_tokens(right)) + if not (has_anchor_tokens or has_shared_path_tokens): return False overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords))) return overlap >= 0.5 @@ -349,9 +359,12 @@ def _flatten_group_items(item: ContentItem) -> list[ContentItem]: def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]: groups: list[list[ContentItem]] = [] + published_at_by_item_id = {item.id: _parse_content_timestamp(item.published_at) for item in items} for item in items: matching_indexes = [ - index for index, group_items in enumerate(groups) if any(_story_related(item, candidate) for candidate in group_items) + index + for index, group_items in enumerate(groups) + if any(_story_related(item, candidate, published_at_by_item_id) for candidate in group_items) ] if not matching_indexes: groups.append([item]) @@ -364,13 +377,13 @@ def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]: winners: list[ContentItem] = [] for group_items in groups: - expanded = [entry for grouped in group_items for entry in _flatten_group_items(grouped)] - deduped_by_id: dict[str, ContentItem] = {item.id: item for item in expanded} - flattened = list(deduped_by_id.values()) - winner = max(flattened, key=item_priority) + flattened_items = [entry for grouped in group_items for entry in _flatten_group_items(grouped)] + deduped_by_id: dict[str, ContentItem] = {item.id: item for item in flattened_items} + clustered_items = list(deduped_by_id.values()) + winner = max(clustered_items, key=item_priority) related_items = tuple( sorted( - (replace(item, related_items=()) for item in flattened if item.id != winner.id), + (replace(item, related_items=()) for item in clustered_items if item.id != winner.id), key=item_priority, reverse=True, ) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index fe0f6f3..a63a8ee 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -383,6 +383,8 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None: self.assertEqual(1, len(articles["items"])) self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"])) + self.assertGreater(len(briefing["sections"]), 0) + self.assertGreater(len(briefing["sections"][0]["bullets"]), 0) self.assertEqual(1, len(briefing["sections"][0]["bullets"])) self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"])) finally: From 085dd6af3841c7bf4a332623db4ad99a77756f99 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 18:30:17 +0000 Subject: [PATCH 4/4] refactor: extract clustering thresholds and improve test clarity Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716 Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- src/wazzup/feeds.py | 11 ++++++++--- src/wazzup/scoring.py | 3 ++- tests/test_pipeline.py | 9 +++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py index 5287b39..f5a9160 100644 --- a/src/wazzup/feeds.py +++ b/src/wazzup/feeds.py @@ -30,6 +30,9 @@ NON_WORD_RE = re.compile(r"[^\w\s-]", re.UNICODE) MIN_STORY_SHARED_KEYWORDS = 2 MAX_STORY_TIME_DELTA = timedelta(hours=18) +MIN_KEYWORD_LENGTH = 4 +MIN_ANCHOR_TOKEN_LENGTH = 8 +MIN_STORY_KEYWORD_OVERLAP_RATIO = 0.5 STORY_STOPWORDS = { "about", "after", @@ -294,7 +297,7 @@ def _keyword_tokens(value: str) -> set[str]: return { token for token in text.split(" ") - if token and (len(token) >= 4 or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS + if token and (len(token) >= MIN_KEYWORD_LENGTH or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS } @@ -311,7 +314,9 @@ def _story_anchor_tokens(tokens: set[str]) -> set[str]: return { token for token in tokens - if any(char.isdigit() for char in token) or token.startswith(("cve", "apt", "kb")) or len(token) >= 8 + if any(char.isdigit() for char in token) + or token.startswith(("cve", "apt", "kb")) + or len(token) >= MIN_ANCHOR_TOKEN_LENGTH } @@ -350,7 +355,7 @@ def _story_related(left: ContentItem, right: ContentItem, published_at_by_item_i if not (has_anchor_tokens or has_shared_path_tokens): return False overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords))) - return overlap >= 0.5 + return overlap >= MIN_STORY_KEYWORD_OVERLAP_RATIO def _flatten_group_items(item: ContentItem) -> list[ContentItem]: diff --git a/src/wazzup/scoring.py b/src/wazzup/scoring.py index bed816e..ad4ebc1 100644 --- a/src/wazzup/scoring.py +++ b/src/wazzup/scoring.py @@ -64,7 +64,8 @@ def score_items( score += 6.0 reasons.append("priority threat intelligence source") - duplicate_group_id = f"dup-{stable_hash(*sorted([item.id, *(related.id for related in item.related_items)]))}" + grouped_item_ids = sorted([item.id, *(related.id for related in item.related_items)]) + duplicate_group_id = f"dup-{stable_hash(*grouped_item_ids)}" scored.append( ScoredItem( item=item, diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index a63a8ee..bcde2af 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -48,7 +48,9 @@ def scored_item(item_id: str, published_at: str, score: float) -> ScoredItem: ) -def content_item(item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str) -> ContentItem: +def make_test_content_item( + item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str +) -> ContentItem: return ContentItem( schema_version=1, id=item_id, @@ -340,7 +342,7 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None: public_dir = Path(tmp_dir) fixed_now = datetime(2026, 5, 6, 10, 30, tzinfo=UTC) grouped_items = [ - content_item( + make_test_content_item( "item-primary", "source-a", "Acme VPN CVE-2026-4242 exploited in active campaign", @@ -348,7 +350,7 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None: "2026-05-06T09:20:00Z", "Researchers report active exploitation of Acme VPN CVE-2026-4242.", ), - content_item( + make_test_content_item( "item-related", "source-b", "Emergency patch for Acme VPN after CVE-2026-4242 exploitation", @@ -384,7 +386,6 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None: self.assertEqual(1, len(articles["items"])) self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"])) self.assertGreater(len(briefing["sections"]), 0) - self.assertGreater(len(briefing["sections"][0]["bullets"]), 0) self.assertEqual(1, len(briefing["sections"][0]["bullets"])) self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"])) finally: