diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 86b085b..33a12fb 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -125,12 +125,17 @@ jobs: ANSIBLE_HOST_KEY_CHECKING: "false" VAULT_ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} VAULT_NEWS_API_KEY: ${{ secrets.NEWS_API_KEY }} + # Ed25519 shard-signing seed (64 hex chars). Set the repo secret with + # `gh secret set ZEITGHOST_SIGNING_KEY`. Empty until then — env.j2 + # defaults it to '' so the deploy renders and ingest stays unsigned. + VAULT_ZEITGHOST_SIGNING_KEY: ${{ secrets.ZEITGHOST_SIGNING_KEY }} run: | TARGET="${{ inputs.target || 'us-ny1' }}" ansible-playbook deploy.yml \ -i "inventories/${TARGET}/hosts.yml" \ -e "vault_anthropic_api_key=${VAULT_ANTHROPIC_API_KEY}" \ -e "vault_news_api_key=${VAULT_NEWS_API_KEY}" \ + -e "vault_zeitghost_signing_key=${VAULT_ZEITGHOST_SIGNING_KEY}" \ -e "zeitghost_commit_sha=${{ github.sha }}" - name: Health check diff --git a/infra/ansible/templates/env.j2 b/infra/ansible/templates/env.j2 index 8ba9c55..6ab35f2 100644 --- a/infra/ansible/templates/env.j2 +++ b/infra/ansible/templates/env.j2 @@ -4,6 +4,14 @@ COMPOSE_PROJECT_NAME=zeitghost ANTHROPIC_API_KEY={{ vault_anthropic_api_key }} NEWS_API_KEY={{ vault_news_api_key }} +# Ed25519 seed (64 hex chars) for signing shards — mint with +# `zeitghost gen-signing-key`, store in the vault as vault_zeitghost_signing_key. +# Empty until provisioned (defaulted so the deploy still renders). +ZEITGHOST_SIGNING_KEY={{ vault_zeitghost_signing_key | default('') }} +# Set zeitghost_require_signing: 1 in the inventory to fail-close ingest once +# the key above is in the vault and verified. Off by default so rolling this +# out doesn't break ingest before the key is provisioned. +ZEITGHOST_REQUIRE_SIGNING={{ zeitghost_require_signing | default(0) }} ZEITGHOST_FEEDS={{ zeitghost_feeds | default('feeds/newsapi.yaml') }} ZEITGHOST_INTERVAL={{ zeitghost_interval | default(3600) }} ZEITGHOST_INGEST_LIMIT={{ zeitghost_ingest_limit | default(50) }} diff --git a/infra/docker/docker-compose.yml b/infra/docker/docker-compose.yml index 2ef2562..8e6391d 100644 --- a/infra/docker/docker-compose.yml +++ b/infra/docker/docker-compose.yml @@ -34,6 +34,10 @@ services: environment: ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} NEWS_API_KEY: ${NEWS_API_KEY:-} + # Shard signing — seed (secret) + fail-closed switch. Values live in .env + # (rendered 0600 by Ansible from the vault); only the references are here. + ZEITGHOST_SIGNING_KEY: ${ZEITGHOST_SIGNING_KEY:-} + ZEITGHOST_REQUIRE_SIGNING: ${ZEITGHOST_REQUIRE_SIGNING:-0} ZEITGHOST_FEEDS: ${ZEITGHOST_FEEDS:-feeds/newsapi.yaml} ZEITGHOST_INTERVAL: ${ZEITGHOST_INTERVAL:-3600} ZEITGHOST_INGEST_LIMIT: ${ZEITGHOST_INGEST_LIMIT:-50} diff --git a/tests/test_shard_integrity.py b/tests/test_shard_integrity.py new file mode 100644 index 0000000..974c4d4 --- /dev/null +++ b/tests/test_shard_integrity.py @@ -0,0 +1,226 @@ +"""Shard integrity & lineage-correctness tests. + +Covers three behaviors: +- #5 bias_score is never default-filled (skip an unscored article). +- #2 load collapses a revision chain to the latest shard per entity. +- #4 shards are signed when (and only when) a signing seed is configured. +""" + +import os + +import pytest + +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + +from zeitghost.bias import AnalyzedArticle +from zeitghost.fetcher import Article + + +def _article(url, score=0.5): + return AnalyzedArticle( + original=Article(title="T", url=url, summary="S", source_name="AP", + published="2026-05-12T00:00:00+00:00", + categories=["politics"]), + bias_score=score, bias_label="center", + variant_left_title="L", variant_left_summary="L sum", + variant_right_title="R", variant_right_summary="R sum", + ) + + +def _pubkey(seed: bytes) -> bytes: + return (Ed25519PrivateKey.from_private_bytes(seed).public_key() + .public_bytes(encoding=serialization.Encoding.Raw, + format=serialization.PublicFormat.Raw)) + + +# --- #5: bias_score skip-not-default -------------------------------------- + +def test_parse_bias_score_skips_missing_not_defaults(): + """A missing or non-numeric score yields None (caller skips) — never a + silent 0.5 that would mislabel an unscored article as 'center'.""" + from zeitghost.bias import _parse_bias_score + + # Present and numeric — passes through, including the valid 0.0 edge. + assert _parse_bias_score({"bias_score": 0.0}) == 0.0 + assert _parse_bias_score({"bias_score": 0.73}) == pytest.approx(0.73) + assert _parse_bias_score({"bias_score": "0.7"}) == pytest.approx(0.7) + # Absent / null / unparseable — None, so analyze_article returns None. + assert _parse_bias_score({}) is None + assert _parse_bias_score({"bias_score": None}) is None + assert _parse_bias_score({"bias_score": "left-ish"}) is None + assert _parse_bias_score({"bias_score": {}}) is None + + +# --- #2: load collapses revision chains to latest -------------------------- + +def test_load_returns_latest_revision_only(tmp_path): + """Two revisions of the same article (chained via parent_shard_id) collapse + to a single card on load — the newest one.""" + from zeitghost.shards import ( + init_store, article_to_internal_shard, load_articles_from_shards, + build_lineage_index, SCOPE_INTERNAL, + ) + store = init_store(tmp_path / "shards") + url = "https://e.com/evolving-story" + + a = _article(url, score=0.30) + first = article_to_internal_shard(a, store) + + a2 = _article(url, score=0.80) # re-analysis, same URL → same entity + lineage = build_lineage_index(store, SCOPE_INTERNAL) + second = article_to_internal_shard(a2, store, lineage_index=lineage) + assert second != first + + loaded = load_articles_from_shards(store) + assert len(loaded) == 1 + assert loaded[0].shard_id == second + assert loaded[0].parent_shard_id == first + assert loaded[0].bias_score == pytest.approx(0.80) + + +def test_load_keeps_distinct_entities_separate(tmp_path): + """Collapse is per-entity — two different articles both load.""" + from zeitghost.shards import ( + init_store, article_to_internal_shard, load_articles_from_shards, + ) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/a"), store) + article_to_internal_shard(_article("https://e.com/b"), store) + + loaded = load_articles_from_shards(store) + assert {a.original.url for a in loaded} == {"https://e.com/a", "https://e.com/b"} + + +# --- #4: opt-in signing ---------------------------------------------------- + +def test_shard_unsigned_without_seed(tmp_path): + from spiritwriter.fabric.store import ShardStore # noqa: F401 (type clarity) + from zeitghost.shards import ( + init_store, article_to_internal_shard, SCOPE_INTERNAL, + ) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/unsigned"), store) + + [shard] = list(store.by_scope(SCOPE_INTERNAL)) + assert shard.signature is None + assert shard.created_by is None + + +def test_shard_signed_with_seed_verifies_and_round_trips(tmp_path): + """A signed shard persists its signature + created_by, and the signature + verifies against the seed's public key after a store round-trip.""" + from spiritwriter.fabric.shard import pubkey_thumbprint + from zeitghost.shards import ( + init_store, article_to_internal_shard, article_to_sw_shard, + SCOPE_INTERNAL, SCOPE_SW_ARTICLE, + ) + seed = os.urandom(32) + pub = _pubkey(seed) + store = init_store(tmp_path / "shards") + + article_to_internal_shard(_article("https://e.com/signed"), store, + signing_seed=seed) + article_to_sw_shard(_article("https://e.com/signed"), store, + signing_seed=seed) + + for scope in (SCOPE_INTERNAL, SCOPE_SW_ARTICLE): + [shard] = list(store.by_scope(scope)) + assert shard.signature is not None + assert shard.created_by == pubkey_thumbprint(pub) + # verify() raises on a bad signature; True means the chain holds. + assert shard.verify(pub) is True + + +def test_tampered_signature_fails_verify(tmp_path): + """Flipping a byte of the signature must make verify() reject it — guards + against a future regression that signs the wrong payload.""" + from cryptography.exceptions import InvalidSignature + from zeitghost.shards import ( + init_store, article_to_internal_shard, SCOPE_INTERNAL, + ) + seed = os.urandom(32) + pub = _pubkey(seed) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/tamper"), store, + signing_seed=seed) + + [shard] = list(store.by_scope(SCOPE_INTERNAL)) + # Corrupt one hex nibble of the signature (wraps so 'f' stays valid hex). + sig = shard.signature + flipped = ("e" if sig[0] != "e" else "d") + sig[1:] + shard.signature = flipped + with pytest.raises(InvalidSignature): + shard.verify(pub) + + +# --- #5 end-to-end: analyze_article drops an unscored article -------------- + +def test_analyze_article_returns_none_when_bias_score_missing(monkeypatch): + """End-to-end: when the LLM response omits bias_score, analyze_article + returns None (skip) rather than constructing a default-0.5 article.""" + import asyncio + import zeitghost.bias as bias + + class _FakeProvider: + async def query(self, prompt, model=None): + # Valid JSON, variants present, but NO bias_score key. + return ('{"bias_label": "center", ' + '"variant_left": {"title": "L", "summary": "ls"}, ' + '"variant_right": {"title": "R", "summary": "rs"}}') + + monkeypatch.setattr(bias, "_get_provider", lambda: _FakeProvider()) + + art = Article(title="T", url="https://e.com/no-score", summary="s", + source_name="src", published="2026-05-12T00:00:00+00:00") + assert asyncio.run(bias.analyze_article(art)) is None + + +# --- resolve_signing_seed -------------------------------------------------- + +def test_resolve_signing_seed_from_env(monkeypatch): + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + seed = os.urandom(32) + monkeypatch.setenv(SIGNING_KEY_NAME, seed.hex()) + assert resolve_signing_seed() == seed + + +def test_resolve_signing_seed_absent_is_none(monkeypatch): + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + monkeypatch.delenv(SIGNING_KEY_NAME, raising=False) + # No env var (and the test keychain won't have this key) → opt-out. + assert resolve_signing_seed() is None + + +def test_resolve_signing_seed_malformed_is_none(monkeypatch): + """A fat-fingered key degrades to unsigned rather than crashing ingest.""" + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + monkeypatch.setenv(SIGNING_KEY_NAME, "not-hex-at-all") + assert resolve_signing_seed() is None + monkeypatch.setenv(SIGNING_KEY_NAME, "ab") # valid hex, but 1 byte ≠ 32 + assert resolve_signing_seed() is None + + +# --- signing_required (prod fail-closed switch) ---------------------------- + +def test_signing_required_off_by_default(monkeypatch): + from zeitghost.shards import signing_required + monkeypatch.delenv("ZEITGHOST_REQUIRE_SIGNING", raising=False) + assert signing_required() is False + assert signing_required(flag=False) is False + + +def test_signing_required_flag_wins(monkeypatch): + from zeitghost.shards import signing_required + monkeypatch.delenv("ZEITGHOST_REQUIRE_SIGNING", raising=False) + assert signing_required(flag=True) is True + + +def test_signing_required_env_truthy_variants(monkeypatch): + from zeitghost.shards import signing_required + for truthy in ("1", "true", "TRUE", "yes", "on"): + monkeypatch.setenv("ZEITGHOST_REQUIRE_SIGNING", truthy) + assert signing_required() is True, truthy + for falsy in ("0", "false", "no", "", "off"): + monkeypatch.setenv("ZEITGHOST_REQUIRE_SIGNING", falsy) + assert signing_required() is False, falsy diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e18b6ed..be04979 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -474,10 +474,15 @@ def test_shard_metadata_round_trip(tmp_path): shard_id_2 = article_to_internal_shard(a, store, lineage_index=lineage) assert shard_id_2 != shard_id_1 - # Loading now returns both revisions; the newer one points at the older + # Loading collapses the revision chain to the latest shard per entity, so + # the re-analyzed article renders as ONE card (its newest revision), not + # two — while still carrying the parent link back to the prior shard. revisions = load_articles_from_shards(store) - by_id = {r.shard_id: r for r in revisions} - assert by_id[shard_id_2].parent_shard_id == shard_id_1 + assert len(revisions) == 1 + latest = revisions[0] + assert latest.shard_id == shard_id_2 + assert latest.parent_shard_id == shard_id_1 + assert latest.bias_score == pytest.approx(0.74) def test_legacy_dump_helpers_smoke(): diff --git a/zeitghost/bias.py b/zeitghost/bias.py index fa9f414..0d0afaa 100644 --- a/zeitghost/bias.py +++ b/zeitghost/bias.py @@ -172,6 +172,24 @@ def bias_lean_display(score: float, center_tolerance: float = 0.025) -> str: """ +def _parse_bias_score(data: dict) -> float | None: + """Bias score from the LLM JSON, or None if absent/non-numeric. + + Returns None rather than defaulting to 0.5 — an article we couldn't score + must be *skipped*, never silently published as "center" (robustness + invariant #1: skip or guard, never default-fill). A literal 0.0 (full + left) is a valid score and passes through; only a missing or unparseable + value yields None. + """ + raw = data.get("bias_score") + if raw is None: + return None + try: + return float(raw) + except (TypeError, ValueError): + return None + + def _extract_json(text: str) -> dict | None: """Extract first valid JSON object from LLM response. @@ -201,8 +219,14 @@ def _extract_json(text: str) -> dict | None: return None -async def analyze_article(article: Article) -> AnalyzedArticle | None: - """Analyze one article — compute bias and generate both variants.""" +async def analyze_article(article: Article, + stats: dict | None = None) -> AnalyzedArticle | None: + """Analyze one article — compute bias and generate both variants. + + Pass a mutable `stats` dict to tally why articles are dropped — currently + the `no_score` reason (skipped for a missing/invalid bias_score). Lets + `analyze_batch` surface the count so a silent feed-drop is noticeable. + """ provider = _get_provider() # Prefer the trafilatura-extracted body when present (richer context for # Claude); fall back to NewsAPI's terse description if body fetch failed. @@ -221,11 +245,21 @@ async def analyze_article(article: Article) -> AnalyzedArticle | None: log.debug("Raw response: %s", response[:500]) return None + bias_score = _parse_bias_score(data) + if bias_score is None: + # No usable score → skip rather than default-fill to 0.5, which + # would mislabel an unscored article as "center" (invariant #1). + log.warning("Missing/invalid bias_score for '%s' — skipping", + article.title[:50]) + if stats is not None: + stats["no_score"] = stats.get("no_score", 0) + 1 + return None + left = data.get("variant_left", {}) or {} right = data.get("variant_right", {}) or {} return AnalyzedArticle( original=article, - bias_score=float(data.get("bias_score", 0.5)), + bias_score=bias_score, bias_label=data.get("bias_label", "center"), variant_left_title=left.get("title", article.title), variant_left_summary=left.get("summary", article.summary), @@ -242,11 +276,23 @@ async def analyze_article(article: Article) -> AnalyzedArticle | None: async def analyze_batch(articles: list[Article]) -> list[AnalyzedArticle]: - """Analyze a batch of articles. Skips any that fail to parse.""" + """Analyze a batch of articles. Skips any that fail to parse. + + Logs a WARNING with the count of articles dropped for a missing/invalid + bias_score — that path silently shrinks the feed (by design, vs. the old + default-fill), so the count surfaces an LLM regression that starts dropping + a chunk of articles. Logging propagates to the CLI's RichHandler, so it + shows on the operator's console during `zeitghost ingest`. + """ results = [] + stats: dict[str, int] = {} for article in articles: - analyzed = await analyze_article(article) + analyzed = await analyze_article(article, stats=stats) if analyzed is not None: results.append(analyzed) log.info("Analyzed %d articles, %d succeeded", len(articles), len(results)) + n_unscored = stats.get("no_score", 0) + if n_unscored: + log.warning("%d article(s) skipped — no usable bias_score in the LLM " + "response (not default-filled to center)", n_unscored) return results diff --git a/zeitghost/cli.py b/zeitghost/cli.py index fbc7bd1..85c2c60 100644 --- a/zeitghost/cli.py +++ b/zeitghost/cli.py @@ -36,16 +36,34 @@ def main(verbose: bool): help="Cap NewsAPI requests this run (default: remaining quota)") @click.option("--dry-run", is_flag=True, help="Fetch + analyze but skip writing shards") -def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): +@click.option("--require-signing", is_flag=True, + help="Fail if no valid signing key is configured, instead of " + "writing unsigned shards. Also enabled by " + "ZEITGHOST_REQUIRE_SIGNING=1 (prod sets this; local/CI leave " + "it off). See `zeitghost gen-signing-key`.") +def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool, + require_signing: bool): """Fetch new articles from NewsAPI, analyze with Claude, write shards.""" from zeitghost.fetcher import fetch_all, enrich_with_bodies from zeitghost.bias import analyze_batch from zeitghost.shards import (init_store, known_url_entities, is_known, article_to_internal_shard, article_to_sw_shard, - build_lineage_index, + build_lineage_index, resolve_signing_seed, + signing_required, SIGNING_KEY_NAME, SCOPE_INTERNAL, SCOPE_SW_ARTICLE) store = init_store() + + # Resolve the signing key up front so a "signing required but unconfigured" + # deploy fails fast — before spending any NewsAPI quota or Claude calls — + # rather than after fetching and analyzing a whole batch. + seed = resolve_signing_seed() + if seed is None and signing_required(require_signing): + raise click.ClickException( + f"Signing is required but no valid {SIGNING_KEY_NAME} is configured. " + f"Provision the key (see `zeitghost gen-signing-key`), or drop " + f"--require-signing / unset ZEITGHOST_REQUIRE_SIGNING for unsigned runs." + ) state_dir = Path(store.path).parent if hasattr(store, "path") else Path.home() / ".zeitghost" console.print(f"[bold]Fetching from {feeds}[/bold]") @@ -84,9 +102,16 @@ def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): # default, but possible if dedup is bypassed later) chain via parent_shard_id. internal_lineage = build_lineage_index(store, SCOPE_INTERNAL) sw_lineage = build_lineage_index(store, SCOPE_SW_ARTICLE) + # `seed` was resolved up front (for the fail-fast require check). Signing is + # opt-in: when it's None the shards are written unsigned. + if analyzed: + console.print(" Signing shards (ZEITGHOST_SIGNING_KEY configured)" + if seed else " [dim]No signing key — writing unsigned shards[/dim]") for a in analyzed: - article_to_internal_shard(a, store, lineage_index=internal_lineage) - article_to_sw_shard(a, store, lineage_index=sw_lineage) + article_to_internal_shard(a, store, lineage_index=internal_lineage, + signing_seed=seed) + article_to_sw_shard(a, store, lineage_index=sw_lineage, + signing_seed=seed) console.print(f" {len(analyzed) * 2} shards written " f"(internal + sw:article)") @@ -152,6 +177,64 @@ def analytics(output: str): console.print(f"[green]Analytics page generated at {path}[/green]") +@main.command(name="gen-signing-key") +@click.option("--store/--no-store", "store_key", default=True, + help="Store the key in the OS keychain (default). --no-store " + "only prints it, for manual provisioning on a headless host.") +@click.option("--print-seed", is_flag=True, + help="Also echo the secret seed after a successful keychain store " + "(for mirroring to the prod env var). --no-store always " + "prints it; otherwise the seed stays off-screen by default.") +def gen_signing_key(store_key: bool, print_seed: bool): + """Generate an Ed25519 key for signing shards' provenance. + + Shards written by `zeitghost ingest` are signed whenever ZEITGHOST_SIGNING_KEY + is resolvable (OS keychain or env var), stamping each with a verifiable + signature + `created_by` thumbprint. This mints a fresh 32-byte seed, + stores it in the keychain (unless --no-store), and prints the public-key + thumbprint — the signer identity `MemoryShard.verify()` checks against. + + Record the thumbprint somewhere durable. The seed itself is secret and is + NOT echoed by default after a keychain store — pass --print-seed (or use + --no-store) to reveal it when you need to mirror the identity onto the + headless us-ny1 builder via a ZEITGHOST_SIGNING_KEY env var. + """ + import os as _os + from cryptography.hazmat.primitives import serialization + from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + from spiritwriter.fabric.shard import pubkey_thumbprint + from zeitghost.shards import SIGNING_KEY_NAME + + seed = _os.urandom(32) # a 32-byte Ed25519 seed + pub = (Ed25519PrivateKey.from_private_bytes(seed).public_key() + .public_bytes(encoding=serialization.Encoding.Raw, + format=serialization.PublicFormat.Raw)) + seed_hex = seed.hex() + console.print(f"[bold]Signer thumbprint:[/bold] {pubkey_thumbprint(pub)}") + + stored = False + if store_key: + from spiritwriter.secrets import configure, set_api_key + configure(service_name="zeitghost") + stored = set_api_key(SIGNING_KEY_NAME, seed_hex) + + if stored: + console.print(f"[green]Stored {SIGNING_KEY_NAME} in the OS keychain — " + f"the next `zeitghost ingest` will sign its shards.[/green]") + if print_seed: + console.print(f"[dim]seed (secret; for the prod env var): {seed_hex}[/dim]") + else: + console.print("[dim]Seed kept off-screen. Re-run with --print-seed " + "to reveal it for mirroring to prod.[/dim]") + else: + # Not stored (--no-store, or keychain unavailable): the printed seed is + # the only copy, so it must be shown regardless of --print-seed. + if store_key: + console.print("[yellow]Keychain unavailable — key NOT stored.[/yellow]") + console.print("Provision it yourself (e.g. on the us-ny1 builder):") + console.print(f" [bold]export {SIGNING_KEY_NAME}={seed_hex}[/bold]") + + @main.command(name="import-legacy") @click.option("--db-url", required=True, help="postgresql://user:pass@host:port/dbname (the temp pg " diff --git a/zeitghost/shards.py b/zeitghost/shards.py index ee4d894..ff2e3d8 100644 --- a/zeitghost/shards.py +++ b/zeitghost/shards.py @@ -25,6 +25,69 @@ SCOPE_INTERNAL = "zeitghost:article" SCOPE_SW_ARTICLE = "sw:article" +# Secret name (OS keychain key / env var) holding the 64-char-hex Ed25519 seed +# used to sign shards. Resolved via spiritwriter.secrets, which checks the +# keychain first then falls back to the environment — so the headless us-ny1 +# builder can be handed the seed through a ZEITGHOST_SIGNING_KEY env var. +SIGNING_KEY_NAME = "ZEITGHOST_SIGNING_KEY" + + +def resolve_signing_seed() -> bytes | None: + """Return the 32-byte Ed25519 signing seed, or None if none is configured. + + Signing is opt-in: an environment without `ZEITGHOST_SIGNING_KEY` set + (local dev, CI, a freshly-provisioned container) simply writes unsigned + shards rather than failing. Pass the result to `article_to_*_shard(..., + signing_seed=...)`. Generate a key with `zeitghost gen-signing-key`. + + Returns None — and logs a warning — when the configured value isn't a + valid 32-byte hex seed, so a fat-fingered key degrades to unsigned rather + than crashing ingest. + """ + # Lazy import keeps secrets/keyring off the module-import path + # (robustness invariant #2: no I/O at import). + from spiritwriter.secrets import configure, get_api_key + + configure(service_name="zeitghost") + raw = get_api_key(SIGNING_KEY_NAME) + if not raw: + return None + try: + seed = bytes.fromhex(raw.strip()) + except ValueError: + log.warning("%s is not valid hex — writing unsigned shards", + SIGNING_KEY_NAME) + return None + if len(seed) != 32: + log.warning("%s must decode to 32 bytes (got %d) — writing unsigned shards", + SIGNING_KEY_NAME, len(seed)) + return None + return seed + + +def signing_required(flag: bool = False) -> bool: + """Whether ingest must fail-closed when no signing key is configured. + + True if the `--require-signing` flag is passed OR `ZEITGHOST_REQUIRE_SIGNING` + is truthy. Prod (us-ny1) sets the env var once its `ZEITGHOST_SIGNING_KEY` + is provisioned, so an accidentally-cleared key fails the run loudly instead + of silently writing unsigned shards. Local dev and CI leave both unset, so + signing stays opt-in there. + """ + if flag: + return True + val = os.environ.get("ZEITGHOST_REQUIRE_SIGNING", "").strip().lower() + return val in ("1", "true", "yes", "on") + + +def _maybe_sign(shard: MemoryShard, signing_seed: bytes | None) -> None: + """Sign `shard` in place when a seed is supplied (sets `signature` and + `created_by`). The signature covers {atoms, scope, origin, …} but NOT the + content-address, so signing never changes `shard.shard_id` — safe to call + before `store.put` and to return the id afterwards.""" + if signing_seed: + shard.sign(signing_seed) + def _agent_string() -> str: """Identify the agent that wrote this shard: zeitghost version + @@ -55,18 +118,31 @@ def _url_entity(url: str) -> str: return f"article:{hashlib.sha256(url.encode()).hexdigest()}" +def _entity_of(shard: MemoryShard) -> str: + """Entity key for a shard: `meta['entity_key']` if present, else derived + from the `source_url` atom. Returns "" when neither is available. + + Single source of truth for the "which article does this shard describe?" + lookup shared by `known_url_entities`, `build_lineage_index`, and + `load_articles_from_shards` — keep them in agreement so dedup, lineage + chaining, and render-time collapse all key off the same identity. + """ + ent = shard.meta.get("entity_key", "") + if ent: + return ent + for atom in shard.atoms: + if atom.key == "source_url" and atom.entity: + return atom.entity + return "" + + def known_url_entities(store: ShardStore) -> set[str]: """Return entity keys (article:{hash}) already in the internal scope.""" seen: set[str] = set() for shard in store.by_scope(SCOPE_INTERNAL): - ent = shard.meta.get("entity_key", "") + ent = _entity_of(shard) if ent: seen.add(ent) - continue - for atom in shard.atoms: - if atom.key == "source_url" and atom.entity: - seen.add(atom.entity) - break return seen @@ -87,12 +163,7 @@ def build_lineage_index(store: ShardStore, scope: str) -> dict[str, str]: """ latest: dict[str, tuple[str, str]] = {} # entity → (shard_id, created_at) for shard in store.by_scope(scope): - ent = shard.meta.get("entity_key", "") - if not ent: - for atom in shard.atoms: - if atom.key == "source_url" and atom.entity: - ent = atom.entity - break + ent = _entity_of(shard) if not ent: continue existing = latest.get(ent) @@ -156,7 +227,8 @@ def _article_tags(article: AnalyzedArticle) -> list[str]: def article_to_internal_shard(article: AnalyzedArticle, store: ShardStore, - lineage_index: dict[str, str] | None = None + lineage_index: dict[str, str] | None = None, + signing_seed: bytes | None = None ) -> str: """Write the zeitghost-internal shard with full L/R variant data. @@ -251,15 +323,18 @@ def article_to_internal_shard(article: AnalyzedArticle, store: ShardStore, tags=_article_tags(article), meta={"entity_key": entity}, ) + _maybe_sign(shard, signing_seed) store.put(shard) - log.debug("Stored zeitghost shard %s for '%s'%s", + log.debug("Stored zeitghost shard %s for '%s'%s%s", shard.shard_id[:12], article.original.title[:40], - f" (revision of {parent[:12]})" if parent else "") + f" (revision of {parent[:12]})" if parent else "", + " [signed]" if shard.signature else "") return shard.shard_id def article_to_sw_shard(article: AnalyzedArticle, store: ShardStore, - lineage_index: dict[str, str] | None = None) -> str: + lineage_index: dict[str, str] | None = None, + signing_seed: bytes | None = None) -> str: """Write the consumer-agnostic sw:article shard. Atom keys (`title`, `summary`, etc.) match frio's `shard_from_article()`. @@ -311,6 +386,7 @@ def article_to_sw_shard(article: AnalyzedArticle, store: ShardStore, tags=_article_tags(article), meta={"entity_key": entity}, ) + _maybe_sign(shard, signing_seed) store.put(shard) return shard.shard_id @@ -389,9 +465,36 @@ def _shard_to_article(shard: MemoryShard) -> AnalyzedArticle | None: def load_articles_from_shards(store: ShardStore) -> list[AnalyzedArticle]: - """Reconstruct AnalyzedArticle objects from all zeitghost:article shards.""" - out = [] + """Reconstruct AnalyzedArticle objects — one per entity, newest revision. + + Re-analysing an article writes a new shard linked to the prior one via + `parent_shard_id` (see `build_lineage_index`), so the store accumulates a + revision chain per article. The renderer wants the *current* state, so we + collapse each chain to its newest shard here — otherwise a re-analyzed + article would surface as two cards. Latest-wins by `created_at`, matching + the selection `build_lineage_index` uses when picking parents. + + Shards with no resolvable entity key (neither `meta['entity_key']` nor a + `source_url` atom) can't be deduped, so they're passed through individually + rather than dropped. + """ + latest: dict[str, MemoryShard] = {} + orphans: list[MemoryShard] = [] for shard in store.by_scope(SCOPE_INTERNAL): + ent = _entity_of(shard) + if not ent: + orphans.append(shard) + continue + cur = latest.get(ent) + # Strict `>` means equal timestamps (sub-second collision, or both "") + # keep the first shard `by_scope` yields — ties go to first-seen. This + # matches build_lineage_index's comparison, so the "latest" rendered + # here is the same shard its parent-chaining treats as the head. + if cur is None or (shard.created_at or "") > (cur.created_at or ""): + latest[ent] = shard + + out = [] + for shard in (*latest.values(), *orphans): a = _shard_to_article(shard) if a: out.append(a)