From 764cc2c500d0a0bab58dcb6ea7d91cab10d2e006 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 8 May 2026 18:23:46 +0000
Subject: [PATCH 1/4] Initial plan


From b515f73214c7f718fac2a310ce4208126f3ca2d7 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 8 May 2026 18:27:36 +0000
Subject: [PATCH 2/4] feat: cluster related stories before AI briefing
 generation

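Add cluster_related_stories() to feeds.py. Two items are treated as the
same story when they were published no more than 18 hours apart and
either have the same normalized title, or share at least two keywords
with a keyword overlap of at least 50% of the smaller keyword set and
have an anchor in common: a shared keyword that contains a digit, starts
with cve/apt/kb or is 8+ characters long, or a shared URL path segment.
Within each group the item ranked highest by item_priority is kept and
the remaining items are attached to it as related_items.

generate() now clusters the windowed items before scoring, and
score_items() derives duplicate_group_id from the sorted IDs of the item
and its related items instead of from the canonical URL.

Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716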

Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com>
---
 src/wazzup/feeds.py                 | 120 +++++++++++++++++++++++++++-
 src/wazzup/pipeline.py              |   3 +-
 src/wazzup/scoring.py               |   4 +-
 tests/fixtures/story-clustering.xml |  32 ++++++++
 tests/test_ai.py                    |  24 ++++++
 tests/test_feeds.py                 |  55 ++++++++++++-
 tests/test_pipeline.py              |  81 +++++++++++++++++++
 7 files changed, 314 insertions(+), 5 deletions(-)
 create mode 100644 tests/fixtures/story-clustering.xml

diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py
index 1f9746b..6151655 100644
--- a/src/wazzup/feeds.py
+++ b/src/wazzup/feeds.py
@@ -7,7 +7,7 @@
 import urllib.request
 import xml.etree.ElementTree as ET
 from dataclasses import replace
-from datetime import UTC, datetime
+from datetime import UTC, datetime, timedelta
 from email.utils import parsedate_to_datetime
 
 from .models import ContentItem, SourceConfig, SourceStatus
@@ -28,6 +28,33 @@
 TAG_RE = re.compile(r"<[^>]+>")
 WHITESPACE_RE = re.compile(r"\s+")
 NON_WORD_RE = re.compile(r"[^\w\s-]", re.UNICODE)
+MIN_STORY_SHARED_KEYWORDS = 2
+MAX_STORY_TIME_DELTA = timedelta(hours=18)
+STORY_STOPWORDS = {
+    "about",
+    "after",
+    "analysis",
+    "announces",
+    "attack",
+    "attacks",
+    "breaking",
+    "commentary",
+    "cyber",
+    "for",
+    "from",
+    "incident",
+    "inside",
+    "latest",
+    "new",
+    "news",
+    "report",
+    "reports",
+    "security",
+    "story",
+    "the",
+    "threat",
+    "update",
+}
 
 
 def utc_now() -> datetime:
@@ -259,3 +286,94 @@ def deduplicate(items: list[ContentItem]) -> list[ContentItem]:
         )
         winners.append(replace(winner, related_items=related_items) if related_items else winner)
     return sorted(winners, key=lambda item: item.published_at, reverse=True)
+
+
+def _keyword_tokens(value: str) -> set[str]:
+    text = NON_WORD_RE.sub(" ", clean_text(value).lower())
+    text = WHITESPACE_RE.sub(" ", text).strip()
+    return {
+        token
+        for token in text.split(" ")
+        if token and (len(token) >= 4 or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS
+    }
+
+
+def _story_keywords(item: ContentItem) -> set[str]:
+    return _keyword_tokens(item.title) | _keyword_tokens(item.summary) | _keyword_tokens(" ".join(item.tags))
+
+
+def _canonical_path_tokens(item: ContentItem) -> set[str]:
+    parsed = urllib.parse.urlsplit(item.canonical_url)
+    return {token for token in parsed.path.lower().split("/") if token and token != "index"}
+
+
+def _story_anchor_tokens(tokens: set[str]) -> set[str]:
+    return {
+        token
+        for token in tokens
+        if any(char.isdigit() for char in token) or token.startswith(("cve", "apt", "kb")) or len(token) >= 8
+    }
+
+
+def _parse_content_timestamp(value: str) -> datetime:
+    normalized = value.replace("Z", "+00:00")
+    parsed = datetime.fromisoformat(normalized)
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=UTC)
+    return parsed.astimezone(UTC)
+
+
+def _story_related(left: ContentItem, right: ContentItem) -> bool:
+    if abs(_parse_content_timestamp(left.published_at) - _parse_content_timestamp(right.published_at)) > MAX_STORY_TIME_DELTA:
+        return False
+    left_title = normalize_title(left.title)
+    right_title = normalize_title(right.title)
+    if left_title and left_title == right_title:
+        return True
+    left_keywords = _story_keywords(left)
+    right_keywords = _story_keywords(right)
+    if not left_keywords or not right_keywords:
+        return False
+    shared_keywords = left_keywords & right_keywords
+    if len(shared_keywords) < MIN_STORY_SHARED_KEYWORDS:
+        return False
+    if not (_story_anchor_tokens(shared_keywords) | (_canonical_path_tokens(left) & _canonical_path_tokens(right))):
+        return False
+    overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords)))
+    return overlap >= 0.5
+
+
+def _flatten_group_items(item: ContentItem) -> list[ContentItem]:
+    return [replace(item, related_items=()), *(replace(related, related_items=()) for related in item.related_items)]
+
+
+def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]:
+    groups: list[list[ContentItem]] = []
+    for item in items:
+        matching_indexes = [
+            index for index, group_items in enumerate(groups) if any(_story_related(item, candidate) for candidate in group_items)
+        ]
+        if not matching_indexes:
+            groups.append([item])
+            continue
+        first_index = matching_indexes[0]
+        groups[first_index].append(item)
+        for index in reversed(matching_indexes[1:]):
+            groups[first_index].extend(groups[index])
+            del groups[index]
+
+    winners: list[ContentItem] = []
+    for group_items in groups:
+        expanded = [entry for grouped in group_items for entry in _flatten_group_items(grouped)]
+        deduped_by_id: dict[str, ContentItem] = {item.id: item for item in expanded}
+        flattened = list(deduped_by_id.values())
+        winner = max(flattened, key=item_priority)
+        related_items = tuple(
+            sorted(
+                (replace(item, related_items=()) for item in flattened if item.id != winner.id),
+                key=item_priority,
+                reverse=True,
+            )
+        )
+        winners.append(replace(winner, related_items=related_items) if related_items else winner)
+    return sorted(winners, key=lambda item: item.published_at, reverse=True)
diff --git a/src/wazzup/pipeline.py b/src/wazzup/pipeline.py
index 50fd877..ad15603 100644
--- a/src/wazzup/pipeline.py
+++ b/src/wazzup/pipeline.py
@@ -10,7 +10,7 @@
 
 from .ai import SummaryRequest, provider_from_env
 from .config import load_app_config, load_sources
-from .feeds import deduplicate, fetch_and_parse, isoformat, parse_feed, utc_now
+from .feeds import cluster_related_stories, deduplicate, fetch_and_parse, isoformat, parse_feed, utc_now
 from .models import BriefingKind, ContentItem, ScoredItem, SourceStatus
 from .publisher import briefing_path, publish_outputs
 from .scoring import parse_iso, score_items
@@ -188,6 +188,7 @@ def generate(argv: Sequence[str] | None = None) -> dict:
     if kind == "hourly":
         content_window_start, content_window_end = rolling_day_window(now, app_config.timezone)
     window_items = filter_items_to_window(items, content_window_start, content_window_end)
+    window_items = cluster_related_stories(window_items)
     scored = score_items(window_items, sources, app_config, now)
     if kind == "hourly":
         scored = prioritize_hourly_new_items(scored, now)
diff --git a/src/wazzup/scoring.py b/src/wazzup/scoring.py
index 81dbb23..bed816e 100644
--- a/src/wazzup/scoring.py
+++ b/src/wazzup/scoring.py
@@ -2,7 +2,7 @@
 
 from datetime import UTC, datetime
 
-from .feeds import canonicalize_url, stable_hash
+from .feeds import stable_hash
 from .models import AppConfig, ContentItem, ScoredItem, SourceConfig
 
 
@@ -64,7 +64,7 @@ def score_items(
             score += 6.0
             reasons.append("priority threat intelligence source")
 
-        duplicate_group_id = f"dup-{stable_hash(canonicalize_url(item.canonical_url))}"
+        duplicate_group_id = f"dup-{stable_hash(*sorted([item.id, *(related.id for related in item.related_items)]))}"
         scored.append(
             ScoredItem(
                 item=item,
diff --git a/tests/fixtures/story-clustering.xml b/tests/fixtures/story-clustering.xml
new file mode 100644
index 0000000..917bf99
--- /dev/null
+++ b/tests/fixtures/story-clustering.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Story clustering fixtures</title>
+    <link>https://example.com</link>
+    <description>Fixture feed for clustering tests</description>
+    <item>
+      <title>Acme VPN CVE-2026-4242 exploited in active campaign</title>
+      <link>https://example.com/security/acme-vpn-cve-2026-4242</link>
+      <guid>story-1</guid>
+      <pubDate>Tue, 06 May 2026 09:00:00 GMT</pubDate>
+      <description>Researchers report active exploitation of Acme VPN CVE-2026-4242 with emergency guidance.</description>
+      <category>security</category>
+    </item>
+    <item>
+      <title>Emergency patch for Acme VPN after CVE-2026-4242 exploitation</title>
+      <link>https://example.net/alerts/acme-vpn-cve-2026-4242-patch</link>
+      <guid>story-2</guid>
+      <pubDate>Tue, 06 May 2026 10:00:00 GMT</pubDate>
+      <description>Vendors ship fixes while defenders track the same Acme VPN CVE-2026-4242 campaign.</description>
+      <category>vulnerability</category>
+    </item>
+    <item>
+      <title>Acme VPN releases regional maintenance update for managed gateways</title>
+      <link>https://example.org/releases/acme-vpn-maintenance-update</link>
+      <guid>story-3</guid>
+      <pubDate>Tue, 06 May 2026 11:00:00 GMT</pubDate>
+      <description>Acme VPN announced a maintenance rollout for gateway stability in Europe.</description>
+      <category>security</category>
+    </item>
+  </channel>
+</rss>
diff --git a/tests/test_ai.py b/tests/test_ai.py
index a7ca0b4..d58e065 100644
--- a/tests/test_ai.py
+++ b/tests/test_ai.py
@@ -179,6 +179,30 @@ def test_prompt_allows_synthesized_bullets_for_related_items(self) -> None:
         self.assertIn("same story", style_guide)
         self.assertIn("cite every source item ID", style_guide)
 
+    def test_prompt_payload_includes_related_items_for_grouped_story_context(self) -> None:
+        source = load_sources("config/sources.yml")[0]
+        item = parse_feed(source, Path("tests/fixtures/microsoft-security-blog.xml").read_bytes())[0]
+        related = replace(item, id="item-related")
+        scored = score_items(
+            [replace(item, related_items=(related,))],
+            [source],
+            load_app_config("config/interests.yml"),
+            datetime(2026, 5, 6, tzinfo=UTC),
+        )
+        payload = build_prompt_payload(
+            SummaryRequest(
+                kind="hourly",
+                window_start="2026-05-06T20:00:00Z",
+                window_end="2026-05-06T21:00:00Z",
+                generated_at="2026-05-06T21:00:00Z",
+                timezone="Europe/Amsterdam",
+                summary_language="en",
+                items=scored,
+            )
+        )
+
+        self.assertEqual("item-related", payload["items"][0]["relatedItems"][0]["id"])
+
     def test_prompt_style_guide_requires_english_translation(self) -> None:
         payload = build_prompt_payload(
             SummaryRequest(
diff --git a/tests/test_feeds.py b/tests/test_feeds.py
index fe42cf3..06d2edf 100644
--- a/tests/test_feeds.py
+++ b/tests/test_feeds.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 
 from wazzup.config import load_sources
-from wazzup.feeds import canonicalize_url, deduplicate, parse_feed
+from wazzup.feeds import canonicalize_url, cluster_related_stories, deduplicate, parse_feed
 
 
 class FeedTests(unittest.TestCase):
@@ -62,6 +62,59 @@ def test_deduplicate_preserves_related_sources_for_same_story(self) -> None:
         self.assertEqual(["item-related-source"], [item.id for item in deduped[0].related_items])
         self.assertEqual("related-source", deduped[0].related_items[0].source_id)
 
+    def test_deduplicate_groups_fixture_duplicates(self) -> None:
+        source = load_sources("config/sources.yml")[0]
+        fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes())
+        duplicate = replace(
+            fixture_items[0],
+            id="item-duplicate-source",
+            source_id="duplicate-source",
+            source_name="Duplicate Source",
+            source_tag="Duplicate",
+            canonical_url="https://duplicate.example/acme-vpn-cve-2026-4242",
+            url="https://duplicate.example/acme-vpn-cve-2026-4242",
+            raw_ref="duplicate-entry",
+        )
+
+        deduped = deduplicate([fixture_items[0], duplicate])
+
+        self.assertEqual(1, len(deduped))
+        self.assertEqual(["item-duplicate-source"], [item.id for item in deduped[0].related_items])
+
+    def test_cluster_related_stories_groups_near_duplicates(self) -> None:
+        source = load_sources("config/sources.yml")[0]
+        fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes())
+        first_story = fixture_items[0]
+        near_duplicate = replace(
+            fixture_items[1],
+            id="item-near-duplicate-source",
+            source_id="near-duplicate-source",
+            source_name="Near Duplicate Source",
+            source_tag="Near Duplicate",
+        )
+
+        clustered = cluster_related_stories([first_story, near_duplicate])
+
+        self.assertEqual(1, len(clustered))
+        self.assertEqual(["item-near-duplicate-source"], [item.id for item in clustered[0].related_items])
+
+    def test_cluster_related_stories_keeps_same_topic_different_story_separate(self) -> None:
+        source = load_sources("config/sources.yml")[0]
+        fixture_items = parse_feed(source, Path("tests/fixtures/story-clustering.xml").read_bytes())
+        first_story = fixture_items[0]
+        different_story = replace(
+            fixture_items[2],
+            id="item-different-story-source",
+            source_id="different-story-source",
+            source_name="Different Story Source",
+            source_tag="Different Story",
+        )
+
+        clustered = cluster_related_stories([first_story, different_story])
+
+        self.assertEqual(2, len(clustered))
+        self.assertTrue(all(not item.related_items for item in clustered))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 89f6c05..fe0f6f3 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -48,6 +48,28 @@ def scored_item(item_id: str, published_at: str, score: float) -> ScoredItem:
     )
 
 
+def content_item(item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str) -> ContentItem:
+    return ContentItem(
+        schema_version=1,
+        id=item_id,
+        source_id=source_id,
+        source_name=f"{source_id} name",
+        source_tag=source_id.upper(),
+        source_type="rss",
+        title=title,
+        url=canonical_url,
+        canonical_url=canonical_url,
+        published_at=published_at,
+        discovered_at=published_at,
+        authors=[],
+        tags=["security"],
+        language="en",
+        summary=summary,
+        content_hash=f"hash-{item_id}",
+        raw_ref=item_id,
+    )
+
+
 class PipelineTests(unittest.TestCase):
     def test_hourly_selection_prioritizes_new_articles(self) -> None:
         now = datetime(2026, 5, 6, 15, 42, tzinfo=UTC)
@@ -310,6 +332,65 @@ def test_exclude_already_featured_hourly_items_drops_fully_featured_related_grou
 
         self.assertEqual(["item-2"], [item.item.id for item in fresh_items])
 
+    def test_generate_clusters_related_story_items_before_ai_summary(self) -> None:
+        previous_provider = os.environ.get("AI_PROVIDER")
+        os.environ["AI_PROVIDER"] = "fake"
+        try:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                public_dir = Path(tmp_dir)
+                fixed_now = datetime(2026, 5, 6, 10, 30, tzinfo=UTC)
+                grouped_items = [
+                    content_item(
+                        "item-primary",
+                        "source-a",
+                        "Acme VPN CVE-2026-4242 exploited in active campaign",
+                        "https://example.com/security/acme-vpn-cve-2026-4242",
+                        "2026-05-06T09:20:00Z",
+                        "Researchers report active exploitation of Acme VPN CVE-2026-4242.",
+                    ),
+                    content_item(
+                        "item-related",
+                        "source-b",
+                        "Emergency patch for Acme VPN after CVE-2026-4242 exploitation",
+                        "https://example.net/alerts/acme-vpn-cve-2026-4242-patch",
+                        "2026-05-06T09:45:00Z",
+                        "Vendors ship fixes for the same Acme VPN CVE-2026-4242 campaign.",
+                    ),
+                ]
+                with (
+                    patch("wazzup.pipeline.utc_now", return_value=fixed_now),
+                    patch(
+                        "wazzup.pipeline.collect_items",
+                        return_value=(
+                            grouped_items,
+                            [],
+                            [],
+                        ),
+                    ),
+                ):
+                    latest = generate(
+                        [
+                            "--public-dir",
+                            str(public_dir),
+                            "--force-briefing",
+                            "hourly",
+                            "--max-items",
+                            "5",
+                        ]
+                    )
+                briefing = json.loads((public_dir / latest["latestBriefingUrl"]).read_text(encoding="utf-8"))
+                articles = json.loads((public_dir / latest["latestArticlesUrl"]).read_text(encoding="utf-8"))
+
+                self.assertEqual(1, len(articles["items"]))
+                self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"]))
+                self.assertEqual(1, len(briefing["sections"][0]["bullets"]))
+                self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"]))
+        finally:
+            if previous_provider is None:
+                os.environ.pop("AI_PROVIDER", None)
+            else:
+                os.environ["AI_PROVIDER"] = previous_provider
+
 
 if __name__ == "__main__":
     unittest.main()

From 01eb8afcc1df3941ddc9a18b1796cd802ec9075d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 8 May 2026 18:28:53 +0000
Subject: [PATCH 3/4] refactor: tighten story clustering readability and tests

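Parse each item's published_at once in cluster_related_stories() and
pass the lookup to _story_related() through a new optional
published_at_by_item_id argument; items missing from the lookup are
still parsed on demand. Name the anchor-token and shared-path checks,
rename the flattening locals, and assert in the pipeline test that the
briefing has at least one section and one bullet before indexing into
them.

Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716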

Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com>
---
 src/wazzup/feeds.py    | 31 ++++++++++++++++++++++---------
 tests/test_pipeline.py |  2 ++
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py
index 6151655..5287b39 100644
--- a/src/wazzup/feeds.py
+++ b/src/wazzup/feeds.py
@@ -323,8 +323,16 @@ def _parse_content_timestamp(value: str) -> datetime:
     return parsed.astimezone(UTC)
 
 
-def _story_related(left: ContentItem, right: ContentItem) -> bool:
-    if abs(_parse_content_timestamp(left.published_at) - _parse_content_timestamp(right.published_at)) > MAX_STORY_TIME_DELTA:
+def _story_related(left: ContentItem, right: ContentItem, published_at_by_item_id: dict[str, datetime] | None = None) -> bool:
+    published_left = (
+        published_at_by_item_id[left.id] if published_at_by_item_id and left.id in published_at_by_item_id else _parse_content_timestamp(left.published_at)
+    )
+    published_right = (
+        published_at_by_item_id[right.id]
+        if published_at_by_item_id and right.id in published_at_by_item_id
+        else _parse_content_timestamp(right.published_at)
+    )
+    if abs(published_left - published_right) > MAX_STORY_TIME_DELTA:
         return False
     left_title = normalize_title(left.title)
     right_title = normalize_title(right.title)
@@ -337,7 +345,9 @@ def _story_related(left: ContentItem, right: ContentItem) -> bool:
     shared_keywords = left_keywords & right_keywords
     if len(shared_keywords) < MIN_STORY_SHARED_KEYWORDS:
         return False
-    if not (_story_anchor_tokens(shared_keywords) | (_canonical_path_tokens(left) & _canonical_path_tokens(right))):
+    has_anchor_tokens = bool(_story_anchor_tokens(shared_keywords))
+    has_shared_path_tokens = bool(_canonical_path_tokens(left) & _canonical_path_tokens(right))
+    if not (has_anchor_tokens or has_shared_path_tokens):
         return False
     overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords)))
     return overlap >= 0.5
@@ -349,9 +359,12 @@ def _flatten_group_items(item: ContentItem) -> list[ContentItem]:
 
 def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]:
     groups: list[list[ContentItem]] = []
+    published_at_by_item_id = {item.id: _parse_content_timestamp(item.published_at) for item in items}
     for item in items:
         matching_indexes = [
-            index for index, group_items in enumerate(groups) if any(_story_related(item, candidate) for candidate in group_items)
+            index
+            for index, group_items in enumerate(groups)
+            if any(_story_related(item, candidate, published_at_by_item_id) for candidate in group_items)
         ]
         if not matching_indexes:
             groups.append([item])
@@ -364,13 +377,13 @@ def cluster_related_stories(items: list[ContentItem]) -> list[ContentItem]:
 
     winners: list[ContentItem] = []
     for group_items in groups:
-        expanded = [entry for grouped in group_items for entry in _flatten_group_items(grouped)]
-        deduped_by_id: dict[str, ContentItem] = {item.id: item for item in expanded}
-        flattened = list(deduped_by_id.values())
-        winner = max(flattened, key=item_priority)
+        flattened_items = [entry for grouped in group_items for entry in _flatten_group_items(grouped)]
+        deduped_by_id: dict[str, ContentItem] = {item.id: item for item in flattened_items}
+        clustered_items = list(deduped_by_id.values())
+        winner = max(clustered_items, key=item_priority)
         related_items = tuple(
             sorted(
-                (replace(item, related_items=()) for item in flattened if item.id != winner.id),
+                (replace(item, related_items=()) for item in clustered_items if item.id != winner.id),
                 key=item_priority,
                 reverse=True,
             )
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index fe0f6f3..a63a8ee 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -383,6 +383,8 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None:
 
                 self.assertEqual(1, len(articles["items"]))
                 self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"]))
+                self.assertGreater(len(briefing["sections"]), 0)
+                self.assertGreater(len(briefing["sections"][0]["bullets"]), 0)
                 self.assertEqual(1, len(briefing["sections"][0]["bullets"]))
                 self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"]))
         finally:

From 085dd6af3841c7bf4a332623db4ad99a77756f99 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 8 May 2026 18:30:17 +0000
Subject: [PATCH 4/4] refactor: extract clustering thresholds and improve test
 clarity

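Replace the literal thresholds in feeds.py with MIN_KEYWORD_LENGTH (4),
MIN_ANCHOR_TOKEN_LENGTH (8) and MIN_STORY_KEYWORD_OVERLAP_RATIO (0.5);
the values are unchanged. Split the duplicate_group_id expression in
score_items() into a named grouped_item_ids list. In the pipeline test,
rename content_item() to make_test_content_item() and drop the
assertGreater on the bullet count, which the assertEqual(1, ...) that
follows already covers.

Agent-Logs-Url: https://github.com/DevSecNinja/wazzup/sessions/89e73a10-8d6d-4a56-9e63-5d1f51cbe716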

Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com>
---
 src/wazzup/feeds.py    | 11 ++++++++---
 src/wazzup/scoring.py  |  3 ++-
 tests/test_pipeline.py |  9 +++++----
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/wazzup/feeds.py b/src/wazzup/feeds.py
index 5287b39..f5a9160 100644
--- a/src/wazzup/feeds.py
+++ b/src/wazzup/feeds.py
@@ -30,6 +30,9 @@
 NON_WORD_RE = re.compile(r"[^\w\s-]", re.UNICODE)
 MIN_STORY_SHARED_KEYWORDS = 2
 MAX_STORY_TIME_DELTA = timedelta(hours=18)
+MIN_KEYWORD_LENGTH = 4
+MIN_ANCHOR_TOKEN_LENGTH = 8
+MIN_STORY_KEYWORD_OVERLAP_RATIO = 0.5
 STORY_STOPWORDS = {
     "about",
     "after",
@@ -294,7 +297,7 @@ def _keyword_tokens(value: str) -> set[str]:
     return {
         token
         for token in text.split(" ")
-        if token and (len(token) >= 4 or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS
+        if token and (len(token) >= MIN_KEYWORD_LENGTH or any(char.isdigit() for char in token)) and token not in STORY_STOPWORDS
     }
 
 
@@ -311,7 +314,9 @@ def _story_anchor_tokens(tokens: set[str]) -> set[str]:
     return {
         token
         for token in tokens
-        if any(char.isdigit() for char in token) or token.startswith(("cve", "apt", "kb")) or len(token) >= 8
+        if any(char.isdigit() for char in token)
+        or token.startswith(("cve", "apt", "kb"))
+        or len(token) >= MIN_ANCHOR_TOKEN_LENGTH
     }
 
 
@@ -350,7 +355,7 @@ def _story_related(left: ContentItem, right: ContentItem, published_at_by_item_i
     if not (has_anchor_tokens or has_shared_path_tokens):
         return False
     overlap = len(shared_keywords) / max(1, min(len(left_keywords), len(right_keywords)))
-    return overlap >= 0.5
+    return overlap >= MIN_STORY_KEYWORD_OVERLAP_RATIO
 
 
 def _flatten_group_items(item: ContentItem) -> list[ContentItem]:
diff --git a/src/wazzup/scoring.py b/src/wazzup/scoring.py
index bed816e..ad4ebc1 100644
--- a/src/wazzup/scoring.py
+++ b/src/wazzup/scoring.py
@@ -64,7 +64,8 @@ def score_items(
             score += 6.0
             reasons.append("priority threat intelligence source")
 
-        duplicate_group_id = f"dup-{stable_hash(*sorted([item.id, *(related.id for related in item.related_items)]))}"
+        grouped_item_ids = sorted([item.id, *(related.id for related in item.related_items)])
+        duplicate_group_id = f"dup-{stable_hash(*grouped_item_ids)}"
         scored.append(
             ScoredItem(
                 item=item,
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index a63a8ee..bcde2af 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -48,7 +48,9 @@ def scored_item(item_id: str, published_at: str, score: float) -> ScoredItem:
     )
 
 
-def content_item(item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str) -> ContentItem:
+def make_test_content_item(
+    item_id: str, source_id: str, title: str, canonical_url: str, published_at: str, summary: str
+) -> ContentItem:
     return ContentItem(
         schema_version=1,
         id=item_id,
@@ -340,7 +342,7 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None:
                 public_dir = Path(tmp_dir)
                 fixed_now = datetime(2026, 5, 6, 10, 30, tzinfo=UTC)
                 grouped_items = [
-                    content_item(
+                    make_test_content_item(
                         "item-primary",
                         "source-a",
                         "Acme VPN CVE-2026-4242 exploited in active campaign",
@@ -348,7 +350,7 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None:
                         "2026-05-06T09:20:00Z",
                         "Researchers report active exploitation of Acme VPN CVE-2026-4242.",
                     ),
-                    content_item(
+                    make_test_content_item(
                         "item-related",
                         "source-b",
                         "Emergency patch for Acme VPN after CVE-2026-4242 exploitation",
@@ -384,7 +386,6 @@ def test_generate_clusters_related_story_items_before_ai_summary(self) -> None:
                 self.assertEqual(1, len(articles["items"]))
                 self.assertEqual({"item-primary", "item-related"}, set(briefing["sourceItemIds"]))
                 self.assertGreater(len(briefing["sections"]), 0)
-                self.assertGreater(len(briefing["sections"][0]["bullets"]), 0)
                 self.assertEqual(1, len(briefing["sections"][0]["bullets"]))
                 self.assertEqual({"item-primary", "item-related"}, set(briefing["sections"][0]["bullets"][0]["citations"]))
         finally: