From 00018eebab3cbc716fc9a83792f558e0c6381334 Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Sat, 30 May 2026 14:03:18 -0700 Subject: [PATCH 1/4] =?UTF-8?q?shards:=20integrity=20&=20lineage-correctne?= =?UTF-8?q?ss=20=E2=80=94=20sign=20shards,=20collapse=20revisions,=20never?= =?UTF-8?q?=20default-fill=20bias?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes to how zeitghost uses spiritwriter's shard substrate: - Opt-in Ed25519 signing. article_to_*_shard now accept a signing_seed and sign before store.put when one is configured; resolve_signing_seed() reads ZEITGHOST_SIGNING_KEY via spiritwriter.secrets (keychain → env fallback, so the headless us-ny1 builder can use an env var). Unconfigured environments write unsigned shards rather than failing. New `zeitghost gen-signing-key` mints a seed, stores it in the keychain, and prints the signer thumbprint. Signing covers {atoms, scope, origin, …} but not the content-address, so shard_id is unchanged. - load_articles_from_shards collapses each parent_shard_id revision chain to its newest shard per entity. Previously every shard in scope was rendered, so a re-analyzed article would surface as duplicate cards the moment a chain formed — a latent bug that becomes reachable once re-analysis is enabled. - analyze_article skips an article whose LLM response lacks a usable bias_score instead of defaulting to 0.5, which would mislabel an unscored article as "center" (robustness invariant #1). Logic extracted to a testable _parse_bias_score helper; a literal 0.0 (full left) still passes through. Factored the shared entity-key lookup into _entity_of (used by known_url_entities, build_lineage_index, and the new load path) so dedup, lineage chaining, and render-time collapse all key off the same identity. 
Tests: new tests/test_shard_integrity.py covers signing round-trip + verify, unsigned-by-default, latest-per-entity collapse, distinct-entity separation, the bias-score guard, and resolve_signing_seed (env/absent/malformed). Updated test_shard_metadata_round_trip for the latest-only load behavior. Not in scope (rides with the trace-emitter PR): shard_superseded trace events, trace_ref population, and surfacing the signer in the card flip-panel. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_shard_integrity.py | 157 ++++++++++++++++++++++++++++++++++ tests/test_smoke.py | 11 ++- zeitghost/bias.py | 28 +++++- zeitghost/cli.py | 62 +++++++++++++- zeitghost/shards.py | 120 ++++++++++++++++++++++---- 5 files changed, 353 insertions(+), 25 deletions(-) create mode 100644 tests/test_shard_integrity.py diff --git a/tests/test_shard_integrity.py b/tests/test_shard_integrity.py new file mode 100644 index 0000000..fcc69f7 --- /dev/null +++ b/tests/test_shard_integrity.py @@ -0,0 +1,157 @@ +"""Shard integrity & lineage-correctness tests. + +Covers three behaviors: +- #5 bias_score is never default-filled (skip an unscored article). +- #2 load collapses a revision chain to the latest shard per entity. +- #4 shards are signed when (and only when) a signing seed is configured. 
+""" + +import os + +import pytest + +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + +from zeitghost.bias import AnalyzedArticle +from zeitghost.fetcher import Article + + +def _article(url, score=0.5): + return AnalyzedArticle( + original=Article(title="T", url=url, summary="S", source_name="AP", + published="2026-05-12T00:00:00+00:00", + categories=["politics"]), + bias_score=score, bias_label="center", + variant_left_title="L", variant_left_summary="L sum", + variant_right_title="R", variant_right_summary="R sum", + ) + + +def _pubkey(seed: bytes) -> bytes: + return (Ed25519PrivateKey.from_private_bytes(seed).public_key() + .public_bytes(encoding=serialization.Encoding.Raw, + format=serialization.PublicFormat.Raw)) + + +# --- #5: bias_score skip-not-default -------------------------------------- + +def test_parse_bias_score_skips_missing_not_defaults(): + """A missing or non-numeric score yields None (caller skips) — never a + silent 0.5 that would mislabel an unscored article as 'center'.""" + from zeitghost.bias import _parse_bias_score + + # Present and numeric — passes through, including the valid 0.0 edge. + assert _parse_bias_score({"bias_score": 0.0}) == 0.0 + assert _parse_bias_score({"bias_score": 0.73}) == pytest.approx(0.73) + assert _parse_bias_score({"bias_score": "0.7"}) == pytest.approx(0.7) + # Absent / null / unparseable — None, so analyze_article returns None. 
+ assert _parse_bias_score({}) is None + assert _parse_bias_score({"bias_score": None}) is None + assert _parse_bias_score({"bias_score": "left-ish"}) is None + assert _parse_bias_score({"bias_score": {}}) is None + + +# --- #2: load collapses revision chains to latest -------------------------- + +def test_load_returns_latest_revision_only(tmp_path): + """Two revisions of the same article (chained via parent_shard_id) collapse + to a single card on load — the newest one.""" + from zeitghost.shards import ( + init_store, article_to_internal_shard, load_articles_from_shards, + build_lineage_index, SCOPE_INTERNAL, + ) + store = init_store(tmp_path / "shards") + url = "https://e.com/evolving-story" + + a = _article(url, score=0.30) + first = article_to_internal_shard(a, store) + + a2 = _article(url, score=0.80) # re-analysis, same URL → same entity + lineage = build_lineage_index(store, SCOPE_INTERNAL) + second = article_to_internal_shard(a2, store, lineage_index=lineage) + assert second != first + + loaded = load_articles_from_shards(store) + assert len(loaded) == 1 + assert loaded[0].shard_id == second + assert loaded[0].parent_shard_id == first + assert loaded[0].bias_score == pytest.approx(0.80) + + +def test_load_keeps_distinct_entities_separate(tmp_path): + """Collapse is per-entity — two different articles both load.""" + from zeitghost.shards import ( + init_store, article_to_internal_shard, load_articles_from_shards, + ) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/a"), store) + article_to_internal_shard(_article("https://e.com/b"), store) + + loaded = load_articles_from_shards(store) + assert {a.original.url for a in loaded} == {"https://e.com/a", "https://e.com/b"} + + +# --- #4: opt-in signing ---------------------------------------------------- + +def test_shard_unsigned_without_seed(tmp_path): + from spiritwriter.fabric.store import ShardStore # noqa: F401 (type clarity) + from zeitghost.shards import ( + 
init_store, article_to_internal_shard, SCOPE_INTERNAL, + ) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/unsigned"), store) + + [shard] = list(store.by_scope(SCOPE_INTERNAL)) + assert shard.signature is None + assert shard.created_by is None + + +def test_shard_signed_with_seed_verifies_and_round_trips(tmp_path): + """A signed shard persists its signature + created_by, and the signature + verifies against the seed's public key after a store round-trip.""" + from spiritwriter.fabric.shard import pubkey_thumbprint + from zeitghost.shards import ( + init_store, article_to_internal_shard, article_to_sw_shard, + SCOPE_INTERNAL, SCOPE_SW_ARTICLE, + ) + seed = os.urandom(32) + pub = _pubkey(seed) + store = init_store(tmp_path / "shards") + + article_to_internal_shard(_article("https://e.com/signed"), store, + signing_seed=seed) + article_to_sw_shard(_article("https://e.com/signed"), store, + signing_seed=seed) + + for scope in (SCOPE_INTERNAL, SCOPE_SW_ARTICLE): + [shard] = list(store.by_scope(scope)) + assert shard.signature is not None + assert shard.created_by == pubkey_thumbprint(pub) + # verify() raises on a bad signature; True means the chain holds. + assert shard.verify(pub) is True + + +# --- resolve_signing_seed -------------------------------------------------- + +def test_resolve_signing_seed_from_env(monkeypatch): + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + seed = os.urandom(32) + monkeypatch.setenv(SIGNING_KEY_NAME, seed.hex()) + assert resolve_signing_seed() == seed + + +def test_resolve_signing_seed_absent_is_none(monkeypatch): + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + monkeypatch.delenv(SIGNING_KEY_NAME, raising=False) + # No env var (and the test keychain won't have this key) → opt-out. 
+ assert resolve_signing_seed() is None + + +def test_resolve_signing_seed_malformed_is_none(monkeypatch): + """A fat-fingered key degrades to unsigned rather than crashing ingest.""" + from zeitghost.shards import resolve_signing_seed, SIGNING_KEY_NAME + monkeypatch.setenv(SIGNING_KEY_NAME, "not-hex-at-all") + assert resolve_signing_seed() is None + monkeypatch.setenv(SIGNING_KEY_NAME, "ab") # valid hex, but 1 byte ≠ 32 + assert resolve_signing_seed() is None diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e18b6ed..be04979 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -474,10 +474,15 @@ def test_shard_metadata_round_trip(tmp_path): shard_id_2 = article_to_internal_shard(a, store, lineage_index=lineage) assert shard_id_2 != shard_id_1 - # Loading now returns both revisions; the newer one points at the older + # Loading collapses the revision chain to the latest shard per entity, so + # the re-analyzed article renders as ONE card (its newest revision), not + # two — while still carrying the parent link back to the prior shard. revisions = load_articles_from_shards(store) - by_id = {r.shard_id: r for r in revisions} - assert by_id[shard_id_2].parent_shard_id == shard_id_1 + assert len(revisions) == 1 + latest = revisions[0] + assert latest.shard_id == shard_id_2 + assert latest.parent_shard_id == shard_id_1 + assert latest.bias_score == pytest.approx(0.74) def test_legacy_dump_helpers_smoke(): diff --git a/zeitghost/bias.py b/zeitghost/bias.py index fa9f414..38b7799 100644 --- a/zeitghost/bias.py +++ b/zeitghost/bias.py @@ -172,6 +172,24 @@ def bias_lean_display(score: float, center_tolerance: float = 0.025) -> str: """ +def _parse_bias_score(data: dict) -> float | None: + """Bias score from the LLM JSON, or None if absent/non-numeric. + + Returns None rather than defaulting to 0.5 — an article we couldn't score + must be *skipped*, never silently published as "center" (robustness + invariant #1: skip or guard, never default-fill). 
A literal 0.0 (full + left) is a valid score and passes through; only a missing or unparseable + value yields None. + """ + raw = data.get("bias_score") + if raw is None: + return None + try: + return float(raw) + except (TypeError, ValueError): + return None + + def _extract_json(text: str) -> dict | None: """Extract first valid JSON object from LLM response. @@ -221,11 +239,19 @@ async def analyze_article(article: Article) -> AnalyzedArticle | None: log.debug("Raw response: %s", response[:500]) return None + bias_score = _parse_bias_score(data) + if bias_score is None: + # No usable score → skip rather than default-fill to 0.5, which + # would mislabel an unscored article as "center" (invariant #1). + log.warning("Missing/invalid bias_score for '%s' — skipping", + article.title[:50]) + return None + left = data.get("variant_left", {}) or {} right = data.get("variant_right", {}) or {} return AnalyzedArticle( original=article, - bias_score=float(data.get("bias_score", 0.5)), + bias_score=bias_score, bias_label=data.get("bias_label", "center"), variant_left_title=left.get("title", article.title), variant_left_summary=left.get("summary", article.summary), diff --git a/zeitghost/cli.py b/zeitghost/cli.py index fbc7bd1..c8d3a1f 100644 --- a/zeitghost/cli.py +++ b/zeitghost/cli.py @@ -42,7 +42,7 @@ def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): from zeitghost.bias import analyze_batch from zeitghost.shards import (init_store, known_url_entities, is_known, article_to_internal_shard, article_to_sw_shard, - build_lineage_index, + build_lineage_index, resolve_signing_seed, SCOPE_INTERNAL, SCOPE_SW_ARTICLE) store = init_store() @@ -84,9 +84,17 @@ def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): # default, but possible if dedup is bypassed later) chain via parent_shard_id. 
internal_lineage = build_lineage_index(store, SCOPE_INTERNAL) sw_lineage = build_lineage_index(store, SCOPE_SW_ARTICLE) + # Sign shards with the Ed25519 provenance key when one is configured. + # Opt-in: unconfigured environments write unsigned shards (see + # resolve_signing_seed / `zeitghost gen-signing-key`). + seed = resolve_signing_seed() + console.print(" Signing shards (ZEITGHOST_SIGNING_KEY configured)" + if seed else " [dim]No signing key — writing unsigned shards[/dim]") for a in analyzed: - article_to_internal_shard(a, store, lineage_index=internal_lineage) - article_to_sw_shard(a, store, lineage_index=sw_lineage) + article_to_internal_shard(a, store, lineage_index=internal_lineage, + signing_seed=seed) + article_to_sw_shard(a, store, lineage_index=sw_lineage, + signing_seed=seed) console.print(f" {len(analyzed) * 2} shards written " f"(internal + sw:article)") @@ -152,6 +160,54 @@ def analytics(output: str): console.print(f"[green]Analytics page generated at {path}[/green]") +@main.command(name="gen-signing-key") +@click.option("--store/--no-store", "store_key", default=True, + help="Store the key in the OS keychain (default). --no-store " + "only prints it, for manual provisioning on a headless host.") +def gen_signing_key(store_key: bool): + """Generate an Ed25519 key for signing shards' provenance. + + Shards written by `zeitghost ingest` are signed whenever ZEITGHOST_SIGNING_KEY + is resolvable (OS keychain or env var), stamping each with a verifiable + signature + `created_by` thumbprint. This mints a fresh 32-byte seed, + stores it in the keychain (unless --no-store), and prints the public-key + thumbprint — the signer identity `MemoryShard.verify()` checks against. + + Record the thumbprint somewhere durable. The seed itself is secret: to run + the same identity on the headless us-ny1 builder, copy the printed seed into + a ZEITGHOST_SIGNING_KEY env var there (the keychain isn't available in the + container). 
+ """ + import os as _os + from cryptography.hazmat.primitives import serialization + from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + from spiritwriter.fabric.shard import pubkey_thumbprint + from zeitghost.shards import SIGNING_KEY_NAME + + seed = _os.urandom(32) # a 32-byte Ed25519 seed + pub = (Ed25519PrivateKey.from_private_bytes(seed).public_key() + .public_bytes(encoding=serialization.Encoding.Raw, + format=serialization.PublicFormat.Raw)) + seed_hex = seed.hex() + console.print(f"[bold]Signer thumbprint:[/bold] {pubkey_thumbprint(pub)}") + + stored = False + if store_key: + from spiritwriter.secrets import configure, set_api_key + configure(service_name="zeitghost") + stored = set_api_key(SIGNING_KEY_NAME, seed_hex) + + if stored: + console.print(f"[green]Stored {SIGNING_KEY_NAME} in the OS keychain — " + f"the next `zeitghost ingest` will sign its shards.[/green]") + console.print(f"[dim]seed (secret; for the prod env var): {seed_hex}[/dim]") + else: + if store_key: + console.print("[yellow]Keychain unavailable — key NOT stored.[/yellow]") + console.print("Provision it yourself (e.g. on the us-ny1 builder):") + console.print(f" [bold]export {SIGNING_KEY_NAME}={seed_hex}[/bold]") + + @main.command(name="import-legacy") @click.option("--db-url", required=True, help="postgresql://user:pass@host:port/dbname (the temp pg " diff --git a/zeitghost/shards.py b/zeitghost/shards.py index ee4d894..c39552b 100644 --- a/zeitghost/shards.py +++ b/zeitghost/shards.py @@ -25,6 +25,54 @@ SCOPE_INTERNAL = "zeitghost:article" SCOPE_SW_ARTICLE = "sw:article" +# Secret name (OS keychain key / env var) holding the 64-char-hex Ed25519 seed +# used to sign shards. Resolved via spiritwriter.secrets, which checks the +# keychain first then falls back to the environment — so the headless us-ny1 +# builder can be handed the seed through a ZEITGHOST_SIGNING_KEY env var. 
+SIGNING_KEY_NAME = "ZEITGHOST_SIGNING_KEY" + + +def resolve_signing_seed() -> bytes | None: + """Return the 32-byte Ed25519 signing seed, or None if none is configured. + + Signing is opt-in: an environment without `ZEITGHOST_SIGNING_KEY` set + (local dev, CI, a freshly-provisioned container) simply writes unsigned + shards rather than failing. Pass the result to `article_to_*_shard(..., + signing_seed=...)`. Generate a key with `zeitghost gen-signing-key`. + + Returns None — and logs a warning — when the configured value isn't a + valid 32-byte hex seed, so a fat-fingered key degrades to unsigned rather + than crashing ingest. + """ + # Lazy import keeps secrets/keyring off the module-import path + # (robustness invariant #2: no I/O at import). + from spiritwriter.secrets import configure, get_api_key + + configure(service_name="zeitghost") + raw = get_api_key(SIGNING_KEY_NAME) + if not raw: + return None + try: + seed = bytes.fromhex(raw.strip()) + except ValueError: + log.warning("%s is not valid hex — writing unsigned shards", + SIGNING_KEY_NAME) + return None + if len(seed) != 32: + log.warning("%s must decode to 32 bytes (got %d) — writing unsigned shards", + SIGNING_KEY_NAME, len(seed)) + return None + return seed + + +def _maybe_sign(shard: MemoryShard, signing_seed: bytes | None) -> None: + """Sign `shard` in place when a seed is supplied (sets `signature` and + `created_by`). 
The signature covers {atoms, scope, origin, …} but NOT the + content-address, so signing never changes `shard.shard_id` — safe to call + before `store.put` and to return the id afterwards.""" + if signing_seed: + shard.sign(signing_seed) + def _agent_string() -> str: """Identify the agent that wrote this shard: zeitghost version + @@ -55,18 +103,31 @@ def _url_entity(url: str) -> str: return f"article:{hashlib.sha256(url.encode()).hexdigest()}" +def _entity_of(shard: MemoryShard) -> str: + """Entity key for a shard: `meta['entity_key']` if present, else derived + from the `source_url` atom. Returns "" when neither is available. + + Single source of truth for the "which article does this shard describe?" + lookup shared by `known_url_entities`, `build_lineage_index`, and + `load_articles_from_shards` — keep them in agreement so dedup, lineage + chaining, and render-time collapse all key off the same identity. + """ + ent = shard.meta.get("entity_key", "") + if ent: + return ent + for atom in shard.atoms: + if atom.key == "source_url" and atom.entity: + return atom.entity + return "" + + def known_url_entities(store: ShardStore) -> set[str]: """Return entity keys (article:{hash}) already in the internal scope.""" seen: set[str] = set() for shard in store.by_scope(SCOPE_INTERNAL): - ent = shard.meta.get("entity_key", "") + ent = _entity_of(shard) if ent: seen.add(ent) - continue - for atom in shard.atoms: - if atom.key == "source_url" and atom.entity: - seen.add(atom.entity) - break return seen @@ -87,12 +148,7 @@ def build_lineage_index(store: ShardStore, scope: str) -> dict[str, str]: """ latest: dict[str, tuple[str, str]] = {} # entity → (shard_id, created_at) for shard in store.by_scope(scope): - ent = shard.meta.get("entity_key", "") - if not ent: - for atom in shard.atoms: - if atom.key == "source_url" and atom.entity: - ent = atom.entity - break + ent = _entity_of(shard) if not ent: continue existing = latest.get(ent) @@ -156,7 +212,8 @@ def 
_article_tags(article: AnalyzedArticle) -> list[str]: def article_to_internal_shard(article: AnalyzedArticle, store: ShardStore, - lineage_index: dict[str, str] | None = None + lineage_index: dict[str, str] | None = None, + signing_seed: bytes | None = None ) -> str: """Write the zeitghost-internal shard with full L/R variant data. @@ -251,15 +308,18 @@ def article_to_internal_shard(article: AnalyzedArticle, store: ShardStore, tags=_article_tags(article), meta={"entity_key": entity}, ) + _maybe_sign(shard, signing_seed) store.put(shard) - log.debug("Stored zeitghost shard %s for '%s'%s", + log.debug("Stored zeitghost shard %s for '%s'%s%s", shard.shard_id[:12], article.original.title[:40], - f" (revision of {parent[:12]})" if parent else "") + f" (revision of {parent[:12]})" if parent else "", + " [signed]" if shard.signature else "") return shard.shard_id def article_to_sw_shard(article: AnalyzedArticle, store: ShardStore, - lineage_index: dict[str, str] | None = None) -> str: + lineage_index: dict[str, str] | None = None, + signing_seed: bytes | None = None) -> str: """Write the consumer-agnostic sw:article shard. Atom keys (`title`, `summary`, etc.) match frio's `shard_from_article()`. @@ -311,6 +371,7 @@ def article_to_sw_shard(article: AnalyzedArticle, store: ShardStore, tags=_article_tags(article), meta={"entity_key": entity}, ) + _maybe_sign(shard, signing_seed) store.put(shard) return shard.shard_id @@ -389,9 +450,32 @@ def _shard_to_article(shard: MemoryShard) -> AnalyzedArticle | None: def load_articles_from_shards(store: ShardStore) -> list[AnalyzedArticle]: - """Reconstruct AnalyzedArticle objects from all zeitghost:article shards.""" - out = [] + """Reconstruct AnalyzedArticle objects — one per entity, newest revision. + + Re-analysing an article writes a new shard linked to the prior one via + `parent_shard_id` (see `build_lineage_index`), so the store accumulates a + revision chain per article. 
The renderer wants the *current* state, so we + collapse each chain to its newest shard here — otherwise a re-analyzed + article would surface as two cards. Latest-wins by `created_at`, matching + the selection `build_lineage_index` uses when picking parents. + + Shards with no resolvable entity key (neither `meta['entity_key']` nor a + `source_url` atom) can't be deduped, so they're passed through individually + rather than dropped. + """ + latest: dict[str, MemoryShard] = {} + orphans: list[MemoryShard] = [] for shard in store.by_scope(SCOPE_INTERNAL): + ent = _entity_of(shard) + if not ent: + orphans.append(shard) + continue + cur = latest.get(ent) + if cur is None or (shard.created_at or "") > (cur.created_at or ""): + latest[ent] = shard + + out = [] + for shard in (*latest.values(), *orphans): a = _shard_to_article(shard) if a: out.append(a) From 13c2f5000e40caeb12e5284a3e4387df472d3164 Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Sat, 30 May 2026 14:28:44 -0700 Subject: [PATCH 2/4] review: gate seed echo behind --print-seed, report no-score skips, harden tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #5 review: - gen-signing-key no longer echoes the secret seed after a successful keychain store. New --print-seed flag (or --no-store, which must print since it's the only copy) reveals it for mirroring to the prod env var; otherwise it stays off-screen so it doesn't linger in shell scrollback/history. - analyze_batch now logs a WARNING with the count of articles dropped for a missing/invalid bias_score (threaded via a stats dict from analyze_article). The new skip-not-default path silently shrinks the feed by design, so the count surfaces an LLM regression; logging reaches the operator's console via the CLI RichHandler. - load_articles_from_shards: comment that equal-timestamp ties go to first-seen, consistent with build_lineage_index's comparison. 
- ingest only prints the signing-mode line when there's something to write. - Tests: tampered-signature → verify() raises InvalidSignature; end-to-end analyze_article returns None when the LLM response omits bias_score (mocked provider). 48 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_shard_integrity.py | 44 +++++++++++++++++++++++++++++++++++ zeitghost/bias.py | 28 ++++++++++++++++++---- zeitghost/cli.py | 27 ++++++++++++++------- zeitghost/shards.py | 4 ++++ 4 files changed, 91 insertions(+), 12 deletions(-) diff --git a/tests/test_shard_integrity.py b/tests/test_shard_integrity.py index fcc69f7..79700e3 100644 --- a/tests/test_shard_integrity.py +++ b/tests/test_shard_integrity.py @@ -132,6 +132,50 @@ def test_shard_signed_with_seed_verifies_and_round_trips(tmp_path): assert shard.verify(pub) is True +def test_tampered_signature_fails_verify(tmp_path): + """Flipping a byte of the signature must make verify() reject it — guards + against a future regression that signs the wrong payload.""" + from cryptography.exceptions import InvalidSignature + from zeitghost.shards import ( + init_store, article_to_internal_shard, SCOPE_INTERNAL, + ) + seed = os.urandom(32) + pub = _pubkey(seed) + store = init_store(tmp_path / "shards") + article_to_internal_shard(_article("https://e.com/tamper"), store, + signing_seed=seed) + + [shard] = list(store.by_scope(SCOPE_INTERNAL)) + # Corrupt one hex nibble of the signature (wraps so 'f' stays valid hex). 
+ sig = shard.signature + flipped = ("e" if sig[0] != "e" else "d") + sig[1:] + shard.signature = flipped + with pytest.raises(InvalidSignature): + shard.verify(pub) + + +# --- #5 end-to-end: analyze_article drops an unscored article -------------- + +def test_analyze_article_returns_none_when_bias_score_missing(monkeypatch): + """End-to-end: when the LLM response omits bias_score, analyze_article + returns None (skip) rather than constructing a default-0.5 article.""" + import asyncio + import zeitghost.bias as bias + + class _FakeProvider: + async def query(self, prompt, model=None): + # Valid JSON, variants present, but NO bias_score key. + return ('{"bias_label": "center", ' + '"variant_left": {"title": "L", "summary": "ls"}, ' + '"variant_right": {"title": "R", "summary": "rs"}}') + + monkeypatch.setattr(bias, "_get_provider", lambda: _FakeProvider()) + + art = Article(title="T", url="https://e.com/no-score", summary="s", + source_name="src", published="2026-05-12T00:00:00+00:00") + assert asyncio.run(bias.analyze_article(art)) is None + + # --- resolve_signing_seed -------------------------------------------------- def test_resolve_signing_seed_from_env(monkeypatch): diff --git a/zeitghost/bias.py b/zeitghost/bias.py index 38b7799..0d0afaa 100644 --- a/zeitghost/bias.py +++ b/zeitghost/bias.py @@ -219,8 +219,14 @@ def _extract_json(text: str) -> dict | None: return None -async def analyze_article(article: Article) -> AnalyzedArticle | None: - """Analyze one article — compute bias and generate both variants.""" +async def analyze_article(article: Article, + stats: dict | None = None) -> AnalyzedArticle | None: + """Analyze one article — compute bias and generate both variants. + + Pass a mutable `stats` dict to tally why articles are dropped — currently + the `no_score` reason (skipped for a missing/invalid bias_score). Lets + `analyze_batch` surface the count so a silent feed-drop is noticeable. 
+ """ provider = _get_provider() # Prefer the trafilatura-extracted body when present (richer context for # Claude); fall back to NewsAPI's terse description if body fetch failed. @@ -245,6 +251,8 @@ async def analyze_article(article: Article) -> AnalyzedArticle | None: # would mislabel an unscored article as "center" (invariant #1). log.warning("Missing/invalid bias_score for '%s' — skipping", article.title[:50]) + if stats is not None: + stats["no_score"] = stats.get("no_score", 0) + 1 return None left = data.get("variant_left", {}) or {} @@ -268,11 +276,23 @@ async def analyze_article(article: Article) -> AnalyzedArticle | None: async def analyze_batch(articles: list[Article]) -> list[AnalyzedArticle]: - """Analyze a batch of articles. Skips any that fail to parse.""" + """Analyze a batch of articles. Skips any that fail to parse. + + Logs a WARNING with the count of articles dropped for a missing/invalid + bias_score — that path silently shrinks the feed (by design, vs. the old + default-fill), so the count surfaces an LLM regression that starts dropping + a chunk of articles. Logging propagates to the CLI's RichHandler, so it + shows on the operator's console during `zeitghost ingest`. 
+ """ results = [] + stats: dict[str, int] = {} for article in articles: - analyzed = await analyze_article(article) + analyzed = await analyze_article(article, stats=stats) if analyzed is not None: results.append(analyzed) log.info("Analyzed %d articles, %d succeeded", len(articles), len(results)) + n_unscored = stats.get("no_score", 0) + if n_unscored: + log.warning("%d article(s) skipped — no usable bias_score in the LLM " + "response (not default-filled to center)", n_unscored) return results diff --git a/zeitghost/cli.py b/zeitghost/cli.py index c8d3a1f..ff0b857 100644 --- a/zeitghost/cli.py +++ b/zeitghost/cli.py @@ -88,8 +88,9 @@ def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): # Opt-in: unconfigured environments write unsigned shards (see # resolve_signing_seed / `zeitghost gen-signing-key`). seed = resolve_signing_seed() - console.print(" Signing shards (ZEITGHOST_SIGNING_KEY configured)" - if seed else " [dim]No signing key — writing unsigned shards[/dim]") + if analyzed: + console.print(" Signing shards (ZEITGHOST_SIGNING_KEY configured)" + if seed else " [dim]No signing key — writing unsigned shards[/dim]") for a in analyzed: article_to_internal_shard(a, store, lineage_index=internal_lineage, signing_seed=seed) @@ -164,7 +165,11 @@ def analytics(output: str): @click.option("--store/--no-store", "store_key", default=True, help="Store the key in the OS keychain (default). --no-store " "only prints it, for manual provisioning on a headless host.") -def gen_signing_key(store_key: bool): +@click.option("--print-seed", is_flag=True, + help="Also echo the secret seed after a successful keychain store " + "(for mirroring to the prod env var). --no-store always " + "prints it; otherwise the seed stays off-screen by default.") +def gen_signing_key(store_key: bool, print_seed: bool): """Generate an Ed25519 key for signing shards' provenance. 
Shards written by `zeitghost ingest` are signed whenever ZEITGHOST_SIGNING_KEY @@ -173,10 +178,10 @@ def gen_signing_key(store_key: bool): stores it in the keychain (unless --no-store), and prints the public-key thumbprint — the signer identity `MemoryShard.verify()` checks against. - Record the thumbprint somewhere durable. The seed itself is secret: to run - the same identity on the headless us-ny1 builder, copy the printed seed into - a ZEITGHOST_SIGNING_KEY env var there (the keychain isn't available in the - container). + Record the thumbprint somewhere durable. The seed itself is secret and is + NOT echoed by default after a keychain store — pass --print-seed (or use + --no-store) to reveal it when you need to mirror the identity onto the + headless us-ny1 builder via a ZEITGHOST_SIGNING_KEY env var. """ import os as _os from cryptography.hazmat.primitives import serialization @@ -200,8 +205,14 @@ def gen_signing_key(store_key: bool): if stored: console.print(f"[green]Stored {SIGNING_KEY_NAME} in the OS keychain — " f"the next `zeitghost ingest` will sign its shards.[/green]") - console.print(f"[dim]seed (secret; for the prod env var): {seed_hex}[/dim]") + if print_seed: + console.print(f"[dim]seed (secret; for the prod env var): {seed_hex}[/dim]") + else: + console.print("[dim]Seed kept off-screen. Re-run with --print-seed " + "to reveal it for mirroring to prod.[/dim]") else: + # Not stored (--no-store, or keychain unavailable): the printed seed is + # the only copy, so it must be shown regardless of --print-seed. if store_key: console.print("[yellow]Keychain unavailable — key NOT stored.[/yellow]") console.print("Provision it yourself (e.g. 
on the us-ny1 builder):") diff --git a/zeitghost/shards.py b/zeitghost/shards.py index c39552b..1952e25 100644 --- a/zeitghost/shards.py +++ b/zeitghost/shards.py @@ -471,6 +471,10 @@ def load_articles_from_shards(store: ShardStore) -> list[AnalyzedArticle]: orphans.append(shard) continue cur = latest.get(ent) + # Strict `>` means equal timestamps (sub-second collision, or both "") + # keep the first shard `by_scope` yields — ties go to first-seen. This + # matches build_lineage_index's comparison, so the "latest" rendered + # here is the same shard its parent-chaining treats as the head. if cur is None or (shard.created_at or "") > (cur.created_at or ""): latest[ent] = shard From 55114c8a91aba5baaa2936c011d288308b9d67bc Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Sat, 30 May 2026 21:04:34 -0700 Subject: [PATCH 3/4] signing: add fail-closed --require-signing mode + wire key through Ansible/compose Follow-up #1 from the PR #5 review: let prod fail-closed when signing is required but no key is configured, so an accidentally-cleared key surfaces loudly instead of silently degrading to unsigned shards. - signing_required(flag) (testable, no Click): True if --require-signing is passed or ZEITGHOST_REQUIRE_SIGNING is truthy. Local/CI leave both unset, so signing stays opt-in there. - ingest resolves the seed up front and fails fast (click.ClickException, exit 1) before spending any NewsAPI quota / Claude calls when required-but-missing. The single resolved seed is reused for the write loop (no double lookup). - env.j2: template ZEITGHOST_SIGNING_KEY from vault_zeitghost_signing_key (defaulted '' so the deploy renders before the key is vaulted) and ZEITGHOST_REQUIRE_SIGNING from zeitghost_require_signing (default 0). - docker-compose builder: pass both through (references only; secret stays in the 0600 .env Ansible renders). Safe rollout: require defaults OFF. 
Provision the vault key, deploy, confirm shards sign, then flip `zeitghost_require_signing: 1` in the inventory. Tests: signing_required off-by-default / flag-wins / env truthy+falsy variants. 51 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- infra/ansible/templates/env.j2 | 8 ++++++++ infra/docker/docker-compose.yml | 4 ++++ tests/test_shard_integrity.py | 25 +++++++++++++++++++++++++ zeitghost/cli.py | 26 +++++++++++++++++++++----- zeitghost/shards.py | 15 +++++++++++++++ 5 files changed, 73 insertions(+), 5 deletions(-) diff --git a/infra/ansible/templates/env.j2 b/infra/ansible/templates/env.j2 index 8ba9c55..6ab35f2 100644 --- a/infra/ansible/templates/env.j2 +++ b/infra/ansible/templates/env.j2 @@ -4,6 +4,14 @@ COMPOSE_PROJECT_NAME=zeitghost ANTHROPIC_API_KEY={{ vault_anthropic_api_key }} NEWS_API_KEY={{ vault_news_api_key }} +# Ed25519 seed (64 hex chars) for signing shards — mint with +# `zeitghost gen-signing-key`, store in the vault as vault_zeitghost_signing_key. +# Empty until provisioned (defaulted so the deploy still renders). +ZEITGHOST_SIGNING_KEY={{ vault_zeitghost_signing_key | default('') }} +# Set zeitghost_require_signing: 1 in the inventory to fail-close ingest once +# the key above is in the vault and verified. Off by default so rolling this +# out doesn't break ingest before the key is provisioned. +ZEITGHOST_REQUIRE_SIGNING={{ zeitghost_require_signing | default(0) }} ZEITGHOST_FEEDS={{ zeitghost_feeds | default('feeds/newsapi.yaml') }} ZEITGHOST_INTERVAL={{ zeitghost_interval | default(3600) }} ZEITGHOST_INGEST_LIMIT={{ zeitghost_ingest_limit | default(50) }} diff --git a/infra/docker/docker-compose.yml b/infra/docker/docker-compose.yml index 2ef2562..8e6391d 100644 --- a/infra/docker/docker-compose.yml +++ b/infra/docker/docker-compose.yml @@ -34,6 +34,10 @@ services: environment: ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} NEWS_API_KEY: ${NEWS_API_KEY:-} + # Shard signing — seed (secret) + fail-closed switch. 
Values live in .env + # (rendered 0600 by Ansible from the vault); only the references are here. + ZEITGHOST_SIGNING_KEY: ${ZEITGHOST_SIGNING_KEY:-} + ZEITGHOST_REQUIRE_SIGNING: ${ZEITGHOST_REQUIRE_SIGNING:-0} ZEITGHOST_FEEDS: ${ZEITGHOST_FEEDS:-feeds/newsapi.yaml} ZEITGHOST_INTERVAL: ${ZEITGHOST_INTERVAL:-3600} ZEITGHOST_INGEST_LIMIT: ${ZEITGHOST_INGEST_LIMIT:-50} diff --git a/tests/test_shard_integrity.py b/tests/test_shard_integrity.py index 79700e3..974c4d4 100644 --- a/tests/test_shard_integrity.py +++ b/tests/test_shard_integrity.py @@ -199,3 +199,28 @@ def test_resolve_signing_seed_malformed_is_none(monkeypatch): assert resolve_signing_seed() is None monkeypatch.setenv(SIGNING_KEY_NAME, "ab") # valid hex, but 1 byte ≠ 32 assert resolve_signing_seed() is None + + +# --- signing_required (prod fail-closed switch) ---------------------------- + +def test_signing_required_off_by_default(monkeypatch): + from zeitghost.shards import signing_required + monkeypatch.delenv("ZEITGHOST_REQUIRE_SIGNING", raising=False) + assert signing_required() is False + assert signing_required(flag=False) is False + + +def test_signing_required_flag_wins(monkeypatch): + from zeitghost.shards import signing_required + monkeypatch.delenv("ZEITGHOST_REQUIRE_SIGNING", raising=False) + assert signing_required(flag=True) is True + + +def test_signing_required_env_truthy_variants(monkeypatch): + from zeitghost.shards import signing_required + for truthy in ("1", "true", "TRUE", "yes", "on"): + monkeypatch.setenv("ZEITGHOST_REQUIRE_SIGNING", truthy) + assert signing_required() is True, truthy + for falsy in ("0", "false", "no", "", "off"): + monkeypatch.setenv("ZEITGHOST_REQUIRE_SIGNING", falsy) + assert signing_required() is False, falsy diff --git a/zeitghost/cli.py b/zeitghost/cli.py index ff0b857..85c2c60 100644 --- a/zeitghost/cli.py +++ b/zeitghost/cli.py @@ -36,16 +36,34 @@ def main(verbose: bool): help="Cap NewsAPI requests this run (default: remaining quota)") 
@click.option("--dry-run", is_flag=True, help="Fetch + analyze but skip writing shards") -def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): +@click.option("--require-signing", is_flag=True, + help="Fail if no valid signing key is configured, instead of " + "writing unsigned shards. Also enabled by " + "ZEITGHOST_REQUIRE_SIGNING=1 (prod sets this; local/CI leave " + "it off). See `zeitghost gen-signing-key`.") +def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool, + require_signing: bool): """Fetch new articles from NewsAPI, analyze with Claude, write shards.""" from zeitghost.fetcher import fetch_all, enrich_with_bodies from zeitghost.bias import analyze_batch from zeitghost.shards import (init_store, known_url_entities, is_known, article_to_internal_shard, article_to_sw_shard, build_lineage_index, resolve_signing_seed, + signing_required, SIGNING_KEY_NAME, SCOPE_INTERNAL, SCOPE_SW_ARTICLE) store = init_store() + + # Resolve the signing key up front so a "signing required but unconfigured" + # deploy fails fast — before spending any NewsAPI quota or Claude calls — + # rather than after fetching and analyzing a whole batch. + seed = resolve_signing_seed() + if seed is None and signing_required(require_signing): + raise click.ClickException( + f"Signing is required but no valid {SIGNING_KEY_NAME} is configured. " + f"Provision the key (see `zeitghost gen-signing-key`), or drop " + f"--require-signing / unset ZEITGHOST_REQUIRE_SIGNING for unsigned runs." + ) state_dir = Path(store.path).parent if hasattr(store, "path") else Path.home() / ".zeitghost" console.print(f"[bold]Fetching from {feeds}[/bold]") @@ -84,10 +102,8 @@ def ingest(feeds: str, limit: int, max_requests: int | None, dry_run: bool): # default, but possible if dedup is bypassed later) chain via parent_shard_id. 
internal_lineage = build_lineage_index(store, SCOPE_INTERNAL) sw_lineage = build_lineage_index(store, SCOPE_SW_ARTICLE) - # Sign shards with the Ed25519 provenance key when one is configured. - # Opt-in: unconfigured environments write unsigned shards (see - # resolve_signing_seed / `zeitghost gen-signing-key`). - seed = resolve_signing_seed() + # `seed` was resolved up front (for the fail-fast require check). Signing is + # opt-in: when it's None the shards are written unsigned. if analyzed: console.print(" Signing shards (ZEITGHOST_SIGNING_KEY configured)" if seed else " [dim]No signing key — writing unsigned shards[/dim]") diff --git a/zeitghost/shards.py b/zeitghost/shards.py index 1952e25..ff2e3d8 100644 --- a/zeitghost/shards.py +++ b/zeitghost/shards.py @@ -65,6 +65,21 @@ def resolve_signing_seed() -> bytes | None: return seed +def signing_required(flag: bool = False) -> bool: + """Whether ingest must fail-closed when no signing key is configured. + + True if the `--require-signing` flag is passed OR `ZEITGHOST_REQUIRE_SIGNING` + is truthy. Prod (us-ny1) sets the env var once its `ZEITGHOST_SIGNING_KEY` + is provisioned, so an accidentally-cleared key fails the run loudly instead + of silently writing unsigned shards. Local dev and CI leave both unset, so + signing stays opt-in there. + """ + if flag: + return True + val = os.environ.get("ZEITGHOST_REQUIRE_SIGNING", "").strip().lower() + return val in ("1", "true", "yes", "on") + + def _maybe_sign(shard: MemoryShard, signing_seed: bytes | None) -> None: """Sign `shard` in place when a seed is supplied (sets `signature` and `created_by`). 
The signature covers {atoms, scope, origin, …} but NOT the From a567caf53d5ff416679f0516fe515b73192e8334 Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Sat, 30 May 2026 21:11:48 -0700 Subject: [PATCH 4/4] deploy: pass ZEITGHOST_SIGNING_KEY secret through to env.j2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the shard-signing seed through the existing GitHub-secret → ansible extra-var → env.j2 chain (this repo has no ansible-vault file; vault_* vars are injected from repo secrets in deploy.yml). env.j2 already defaults the var to '' so the deploy renders before the ZEITGHOST_SIGNING_KEY repo secret is set. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/deploy.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 86b085b..33a12fb 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -125,12 +125,17 @@ jobs: ANSIBLE_HOST_KEY_CHECKING: "false" VAULT_ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} VAULT_NEWS_API_KEY: ${{ secrets.NEWS_API_KEY }} + # Ed25519 shard-signing seed (64 hex chars). Set the repo secret with + # `gh secret set ZEITGHOST_SIGNING_KEY`. Empty until then — env.j2 + # defaults it to '' so the deploy renders and ingest stays unsigned. + VAULT_ZEITGHOST_SIGNING_KEY: ${{ secrets.ZEITGHOST_SIGNING_KEY }} run: | TARGET="${{ inputs.target || 'us-ny1' }}" ansible-playbook deploy.yml \ -i "inventories/${TARGET}/hosts.yml" \ -e "vault_anthropic_api_key=${VAULT_ANTHROPIC_API_KEY}" \ -e "vault_news_api_key=${VAULT_NEWS_API_KEY}" \ + -e "vault_zeitghost_signing_key=${VAULT_ZEITGHOST_SIGNING_KEY}" \ -e "zeitghost_commit_sha=${{ github.sha }}" - name: Health check