From 2108fa3bf213cdfca6098360a6ecfa58caffc42c Mon Sep 17 00:00:00 2001
From: Tovli <Dekel@tovli.co.il>
Date: Mon, 15 Jun 2026 17:11:49 +0300
Subject: [PATCH 1/2] docs: add per-crate READMEs and bump workspace to 0.3.8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a README.md to every crate under crates/ (7 core crates + 5 adapters),
each documenting the crate's role, public API, a usage example, workspace
placement, status, and links to the ADRs/DDD contexts it realizes — matching
the style of apps/el-chat/README.md.

Bump the workspace version 0.3.7 -> 0.3.8 via `cargo set-version --workspace`
(docs-only change; el-chat inherits, the excluded el-grammar-llguidance is
unaffected).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                                    | 24 ++---
 Cargo.toml                                    | 22 ++---
 crates/adapters/el-cloud/README.md            | 83 +++++++++++++++++
 crates/adapters/el-engine-candle/README.md    | 79 ++++++++++++++++
 crates/adapters/el-ffi/README.md              | 91 ++++++++++++++++++
 .../adapters/el-grammar-llguidance/README.md  | 70 ++++++++++++++
 .../adapters/el-provenance-ed25519/README.md  | 57 ++++++++++++
 crates/el-core/README.md                      | 80 ++++++++++++++++
 crates/el-grammar/README.md                   | 66 +++++++++++++
 crates/el-memory/README.md                    | 73 +++++++++++++++
 crates/el-provenance/README.md                | 63 +++++++++++++
 crates/el-runtime/README.md                   | 92 +++++++++++++++++++
 crates/el-safety/README.md                    | 64 +++++++++++++
 crates/el-telemetry/README.md                 | 61 ++++++++++++
 14 files changed, 902 insertions(+), 23 deletions(-)
 create mode 100644 crates/adapters/el-cloud/README.md
 create mode 100644 crates/adapters/el-engine-candle/README.md
 create mode 100644 crates/adapters/el-ffi/README.md
 create mode 100644 crates/adapters/el-grammar-llguidance/README.md
 create mode 100644 crates/adapters/el-provenance-ed25519/README.md
 create mode 100644 crates/el-core/README.md
 create mode 100644 crates/el-grammar/README.md
 create mode 100644 crates/el-memory/README.md
 create mode 100644 crates/el-provenance/README.md
 create mode 100644 crates/el-runtime/README.md
 create mode 100644 crates/el-safety/README.md
 create mode 100644 crates/el-telemetry/README.md

diff --git a/Cargo.lock b/Cargo.lock
index 193534e..385d488 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -793,7 +793,7 @@ checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
 
 [[package]]
 name = "el-chat"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
  "el-engine-candle",
@@ -801,7 +801,7 @@ dependencies = [
 
 [[package]]
 name = "el-cloud"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
  "reqwest",
@@ -812,11 +812,11 @@ dependencies = [
 
 [[package]]
 name = "el-core"
-version = "0.3.7"
+version = "0.3.8"
 
 [[package]]
 name = "el-engine-candle"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "candle-core",
  "candle-transformers",
@@ -828,7 +828,7 @@ dependencies = [
 
 [[package]]
 name = "el-ffi"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-cloud",
  "el-core",
@@ -842,7 +842,7 @@ dependencies = [
 
 [[package]]
 name = "el-grammar"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
  "el-provenance",
@@ -851,21 +851,21 @@ dependencies = [
 
 [[package]]
 name = "el-memory"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
 ]
 
 [[package]]
 name = "el-provenance"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
 ]
 
 [[package]]
 name = "el-provenance-ed25519"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "ed25519-dalek",
  "el-core",
@@ -874,7 +874,7 @@ dependencies = [
 
 [[package]]
 name = "el-runtime"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
  "el-memory",
@@ -884,14 +884,14 @@ dependencies = [
 
 [[package]]
 name = "el-safety"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
 ]
 
 [[package]]
 name = "el-telemetry"
-version = "0.3.7"
+version = "0.3.8"
 dependencies = [
  "el-core",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index d6902b4..a3f7f4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,7 +35,7 @@ exclude = [
 
 [workspace.package]
 edition = "2021"
-version = "0.3.7"
+version = "0.3.8"
 license = "Apache-2.0"
 rust-version = "1.96"
 repository = "Edge-Native LLM SDK"
@@ -45,16 +45,16 @@ repository = "Edge-Native LLM SDK"
 # This is the single source of truth for internal dep version requirements;
 # member crates reference them with `{ workspace = true }` and add no inline
 # version fields. `cargo set-version --workspace <ver>` updates every entry.
-el-core               = { path = "crates/el-core", version = "0.3.7" }
-el-memory             = { path = "crates/el-memory", version = "0.3.7" }
-el-telemetry          = { path = "crates/el-telemetry", version = "0.3.7" }
-el-provenance         = { path = "crates/el-provenance", version = "0.3.7" }
-el-safety             = { path = "crates/el-safety", version = "0.3.7" }
-el-runtime            = { path = "crates/el-runtime", version = "0.3.7" }
-el-grammar            = { path = "crates/el-grammar", version = "0.3.7" }
-el-provenance-ed25519 = { path = "crates/adapters/el-provenance-ed25519", version = "0.3.7" }
-el-engine-candle      = { path = "crates/adapters/el-engine-candle", version = "0.3.7" }
-el-cloud              = { path = "crates/adapters/el-cloud", version = "0.3.7" }
+el-core               = { path = "crates/el-core", version = "0.3.8" }
+el-memory             = { path = "crates/el-memory", version = "0.3.8" }
+el-telemetry          = { path = "crates/el-telemetry", version = "0.3.8" }
+el-provenance         = { path = "crates/el-provenance", version = "0.3.8" }
+el-safety             = { path = "crates/el-safety", version = "0.3.8" }
+el-runtime            = { path = "crates/el-runtime", version = "0.3.8" }
+el-grammar            = { path = "crates/el-grammar", version = "0.3.8" }
+el-provenance-ed25519 = { path = "crates/adapters/el-provenance-ed25519", version = "0.3.8" }
+el-engine-candle      = { path = "crates/adapters/el-engine-candle", version = "0.3.8" }
+el-cloud              = { path = "crates/adapters/el-cloud", version = "0.3.8" }
 
 [profile.release]
 opt-level = 3
diff --git a/crates/adapters/el-cloud/README.md b/crates/adapters/el-cloud/README.md
new file mode 100644
index 0000000..82fd053
--- /dev/null
+++ b/crates/adapters/el-cloud/README.md
@@ -0,0 +1,83 @@
+# el-cloud — opt-in frontier LLM cloud backend
+
+The opt-in frontier cloud backend (ADR-010). It implements
+`el_core::LlmProvider` over [`reqwest`](https://crates.io/crates/reqwest) using
+the OpenAI Chat Completions API — which OpenAI, Anthropic (compat), Gemini
+(compat), Ollama, and any OpenAI-compatible endpoint all speak.
+
+## Air-gap is preserved (ADR-004)
+
+This is the project's **only** outbound network surface, and it is opt-in at the
+API level:
+
+- Outbound calls happen **only** when an app *explicitly constructs* a
+  `CloudProvider`. Apps that never build this type have zero outbound surface,
+  even though the crate compiles as part of the workspace.
+- The `reqwest` network dependency is gated to non-wasm32 targets (see the
+  wasm32 note below), so the web surface has no blocking HTTP transport at all.
+- Every consultation emits a content-free `DomainEvent::FrontierLlmConsulted`
+  (the `provider_hash` is a CRC32 of the provider prefix — never the API key or
+  any content).
+
+## What it provides
+
+- **`CloudProvider`** — owns a pooled `reqwest::blocking::Client` with explicit
+  timeouts (10 s connect, 60 s idle/per-read, 120 s total per non-streaming
+  request) so a stalled provider can never block an FFI caller indefinitely.
+  - `new()` / `Default`
+  - `with_event_sink(|event| …)` — register a callback for `DomainEvent`s (e.g.
+    to feed an [`el_telemetry::MetricsCollector`](../../el-telemetry)).
+  - implements `LlmProvider::chat` (blocking) and `chat_stream` (SSE).
+
+### Provider routing (by model prefix)
+
+| Model string | Base URL |
+|--------------|----------|
+| `openai/<model>` | `https://api.openai.com/v1` |
+| `anthropic/<model>` | `https://api.anthropic.com/v1` (compat) |
+| `gemini/<model>` | `https://generativelanguage.googleapis.com/v1beta/openai` |
+| `ollama/<model>` | `http://localhost:11434/v1` (no key required) |
+| `http(s)://…/<model>` | custom base URL |
+
+The streaming path parses Server-Sent Events strictly: provider error payloads
+and malformed chunks fail the call (a half-finished stream is never reported as
+a clean completion), and errors carry only parse category/position/size —
+**never** echoed content, per the el-core error contract.
+
+## Usage
+
+```rust
+use el_core::{ChatMessage, ChatRequest, CredentialRef, LlmProvider};
+use el_cloud::CloudProvider;
+
+let provider = CloudProvider::new();
+
+// The credential is resolved at runtime from the platform keystore — never embedded.
+let req = ChatRequest::new("openai/gpt-4o", vec![ChatMessage::user("Hello!")])
+    .with_credential(CredentialRef::new(api_key_from_keystore))
+    .with_max_tokens(256);
+
+let resp = provider.chat(&req)?;
+println!("{}", resp.content);
+# Ok::<(), el_core::EdgeError>(())
+```
+
+## wasm32 note
+
+`reqwest::blocking` is unavailable on `wasm32-unknown-unknown` (no threads), and
+the synchronous `LlmProvider::chat` cannot await the browser's `fetch`. The
+network modules are therefore gated to non-wasm32, and the web binding
+(`el-ffi`) exposes a throwing `cloud` constructor instead of silently degrading
+(ADR-010 amendment).
+
+## Status
+
+Implemented as an explicit egress adapter and a regular workspace member — the
+opt-in is enforced at construction, not by excluding it from the build. Uses
+`rustls-tls` so Android cross-compiles need no target OpenSSL sysroot.
+
+---
+
+Part of the [Edge Intelligence](../../../README.md) workspace. Realizes
+[ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md)
+while preserving [ADR-004](../../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md).
diff --git a/crates/adapters/el-engine-candle/README.md b/crates/adapters/el-engine-candle/README.md
new file mode 100644
index 0000000..08a12b9
--- /dev/null
+++ b/crates/adapters/el-engine-candle/README.md
@@ -0,0 +1,79 @@
+# el-engine-candle — Candle inference engine adapter
+
+The inference-engine adapter over [Candle](https://github.com/huggingface/candle)
+(ADR-002). It implements the runtime's `InferenceEngine` port (`RuntimeAcl`) and
+the `LlmProvider` trait (ADR-010), so the same `el_runtime::InferenceSession`
+decode loop drives everything — nothing in the SDK pipeline is bypassed.
+
+Float logits are quantised to integer milli-logits at the anti-corruption-layer
+boundary, so Candle's `Tensor`/`Device` types never cross into the domain. No
+`unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+This crate ships two engines and two providers, from "seam proof" to
+"real on-device chat":
+
+| Type | What it is |
+|------|------------|
+| `CandleEngine` | The **engine-seam proof**: one real Candle forward, `embed[last] · w_out` — a single linear projection. `toy()` builds deterministic synthetic weights; `from_path`/`from_bytes` load `token_embd.weight` + `output.weight` (or `lm_head.weight`) from a GGUF. Transformer blocks, attention, RoPE, and norms are *ignored* — logits won't match a real model. |
+| `LocalLlmProvider` | Wraps `CandleEngine` behind `LlmProvider` with a byte-level tokenizer. Good for exercising the binding layer end-to-end. |
+| `QwenEngine` | A **real Qwen2 transformer** `InferenceEngine` via `candle-transformers`, holding Candle's stateful KV cache. |
+| `QwenChatProvider` | A **real local chat backend**: a Qwen2 GGUF + its `tokenizer.json`, rendered to Qwen2.5 ChatML and driven through the standard provenance-gated session. This is what powers [`apps/el-chat`](../../../apps/el-chat). |
+
+### Expected GGUF tensor names (`CandleEngine`)
+
+- `token_embd.weight` — embedding table `[vocab, dim]`
+- `output.weight` or `lm_head.weight` — lm-head `[vocab, dim]` (standard Llama layout)
+
+Mismatched shapes are rejected at load time, not silently at inference.
+
+## Usage
+
+```rust
+use el_core::{ChatMessage, ChatRequest, LlmProvider};
+use el_engine_candle::QwenChatProvider;
+
+// Real on-device chat: a local Qwen2 GGUF + its tokenizer (no network egress).
+let provider = QwenChatProvider::from_paths(
+    "models/qwen2.5-0.5b-instruct-q4_k_m.gguf",
+    "models/qwen2.5-0.5b-instruct.tokenizer.json",
+)?;
+
+let req = ChatRequest::new("local", vec![ChatMessage::user("What is the capital of France?")])
+    .with_max_tokens(128);
+let reply = provider.chat(&req)?;
+println!("{}", reply.content);
+# Ok::<(), el_core::EdgeError>(())
+```
+
+Each `chat` call builds a fresh `QwenEngine` (Candle exposes no public KV-cache
+reset) and runs the standard SDK path: provenance permit → `load_prompt`
+(prefill) → `generate` (grammar mask → safety steer → greedy commit). Decoding
+is deterministic greedy argmax, so replies are reproducible.
+
+## Benchmark instrumentation
+
+Setting `EL_BENCH=1` makes `QwenChatProvider::chat` print a per-phase breakdown
+(model load / tokenize / prefill / decode / detokenize) plus per-forward
+attribution (model compute vs. seam quantisation vs. runtime loop) to stderr.
+It is zero-cost when unset and is a diagnostic only — not part of public behaviour.
+
+## Features & dependencies
+
+- `candle-core` + `candle-transformers` (0.8), and `tokenizers` 0.21 with the
+  **pure-Rust `fancy-regex`** backend (no C/C++ `onig`/`esaxx`, per ADR-008).
+- `metal` feature → enables `candle-core/metal` for Apple GPUs.
+
+## Status
+
+Implemented; runs real on-device chat. The `CandleEngine` linear projection is
+the ADR-002 engine-seam proof; `QwenEngine`/`QwenChatProvider` are the real
+transformer path.
+
+---
+
+Part of the [Edge Intelligence](../../../README.md) workspace. Realizes
+[ADR-002](../../../docs/adr/ADR-002-candle-as-rust-native-inference-engine.md)
+and [ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md);
+see the [Inference Runtime](../../../docs/ddd/bounded-contexts/01-inference-runtime.md) context.
diff --git a/crates/adapters/el-ffi/README.md b/crates/adapters/el-ffi/README.md
new file mode 100644
index 0000000..1172d87
--- /dev/null
+++ b/crates/adapters/el-ffi/README.md
@@ -0,0 +1,91 @@
+# el-ffi — host bindings (React Native, Flutter, Web)
+
+One Rust API surface exported three ways, so mobile and web apps can call the
+SDK in their native idiom (ADR-001, ADR-009, ADR-010):
+
+| Surface | Tool | Output |
+|---------|------|--------|
+| **React Native** | `uniffi-bindgen-react-native` | TypeScript + JSI C++ + Turbo Module |
+| **Flutter** | `flutter_rust_bridge` v2 | Dart opaque handle, `Future`/`Stream` |
+| **Web / npm** | `wasm-bindgen` | ESM TypeScript package via `wasm-pack` |
+
+No `unsafe` (`#![forbid(unsafe_code)]`). The crate is `cdylib` + `staticlib` +
+`lib` so each toolchain can link the form it needs.
+
+## What it provides
+
+- **`EdgeLlm`** — the flat, FFI-friendly facade, annotated for all three
+  surfaces at once:
+  - `EdgeLlm::local(model_uri)` — local Candle engine, air-gapped (ADR-002/004).
+    An empty `model_uri` uses a deterministic toy model for development;
+    a path loads a consumer-supplied GGUF.
+  - `EdgeLlm::cloud(model, api_key)` — frontier cloud backend (opt-in, ADR-010).
+    **Native only** — see the web limitation below. `api_key` must come from the
+    platform keystore, never embedded.
+  - `ask(prompt) -> Result<String, SdkError>` — blocking chat.
+  - `ask_stream(prompt, |token| …)` — closure streaming (Flutter / FRB v2).
+  - `ask_stream_cb(prompt, handler)` — `StreamHandler` callback streaming
+    (React Native; UniFFI cannot export `impl FnMut`).
+  - `reset()`.
+- **`SdkError`** — a thin, FFI-safe projection of `el_core::EdgeError`
+  (`el-core`'s `Box<str>`/Rust-specific variants are not FFI-safe). Projects to
+  the host language's exception type, or a JS exception on wasm.
+- **`StreamHandler`** — the React Native streaming callback interface.
+
+## Usage (Rust side)
+
+```rust
+use el_ffi::EdgeLlm;
+
+// Empty path → deterministic toy model; exercises the full binding layer.
+let sdk = EdgeLlm::local(String::new())?;
+let reply = sdk.ask("hello".into())?;
+assert!(!reply.is_empty());
+
+// Streaming (Flutter / closure form):
+sdk.ask_stream("hi".into(), |fragment| print!("{fragment}"))?;
+# Ok::<(), el_ffi::SdkError>(())
+```
+
+## Building the bindings
+
+The Rust binding *surfaces* compile on the host; the cross-target builds and
+codegen run via the [`Makefile`](../../../Makefile):
+
+```sh
+make build-android    # cargo build --target aarch64-linux-android  (shared lib)
+make build-ios        # cargo build --target aarch64-apple-ios       (static lib)
+make build-wasm       # wasm-pack build → out/web ESM package
+
+make codegen-rn       # React Native JSI bindings (needs build-android)
+make codegen-flutter  # flutter_rust_bridge v2 Dart bindings
+make bindings         # all three surfaces
+```
+
+Prerequisites (rustup targets, NDK linker, `wasm-pack`,
+`uniffi-bindgen-react-native`, `flutter_rust_bridge_codegen`) are documented in
+the Makefile header.
+
+## Web limitations
+
+On `wasm32` the local path currently uses a dev-stage echo placeholder until
+Candle-on-wasm is wired, and the **cloud backend is not available on web**
+(ADR-010 amendment): `el-cloud`'s blocking HTTP transport has no wasm
+implementation, so `EdgeLlm.cloud` throws an explicit error there instead of
+silently degrading. Use a native binding (React Native / Flutter) for cloud
+access.
+
+## Status
+
+Implemented and tested (native + `wasm32` compile). As a workspace member, the
+host-target Rust surfaces build and test with the rest of the workspace
+(`cargo test --workspace`); the Android / iOS / wasm cross-builds and binding
+codegen run separately via the Makefile because those toolchains are installed
+out-of-band.
+
+---
+
+Part of the [Edge Intelligence](../../../README.md) workspace. Realizes
+[ADR-001](../../../docs/adr/ADR-001-adopt-webassembly-as-cross-platform-sdk-runtime.md),
+[ADR-009](../../../docs/adr/ADR-009-flutter-rust-bridge-for-dart-bindings.md),
+and [ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md).
diff --git a/crates/adapters/el-grammar-llguidance/README.md b/crates/adapters/el-grammar-llguidance/README.md
new file mode 100644
index 0000000..b280cc6
--- /dev/null
+++ b/crates/adapters/el-grammar-llguidance/README.md
@@ -0,0 +1,70 @@
+# el-grammar-llguidance — JSON-schema grammar masking over llguidance
+
+The production grammar masker (ADR-004): real **JSON-schema** token masking over
+[llguidance](https://github.com/guidance-ai/llguidance), bridged to a HuggingFace
+tokenizer. It is the scale-up of the pure-Rust, regular-grammar
+[`el-grammar`](../../el-grammar) — both implement the same
+`el_runtime::GrammarMasker` port, so the runtime is agnostic to which is wired.
+
+The grammar FSM is built once from a JSON schema at construction time and
+advanced per committed token during generation (~50 µs/mask for a 128k vocab).
+The HuggingFace `tokenizers::Tokenizer` is bridged to llguidance's `TokEnv` via
+`toktrie_hf_tokenizers` (the official guidance-ai integration crate). No
+`unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+- **`LlguidanceMasker`** — implements `el_runtime::GrammarMasker`:
+  - `from_tokenizer(&tokenizer, schema_json)` — build a masker from a HF
+    tokenizer and a JSON-schema string.
+  - `mask(recent, vocab)` — advances the FSM over newly committed tokens and
+    returns the allowed-token mask. Once the grammar is satisfied/exhausted (or
+    a token violates it and the parser cannot recover) it falls back to
+    allow-all from there on.
+  - `reset()` — rebuild a fresh parser for the same schema.
+
+## Usage
+
+```rust
+use el_grammar_llguidance::LlguidanceMasker;
+use el_runtime::GrammarMasker;
+
+let tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json").unwrap();
+let masker = LlguidanceMasker::from_tokenizer(&tokenizer, r#"{"type":"integer"}"#)?;
+
+// In the decode loop, wired into the session's Ports:
+let allow_mask = masker.mask(&committed_tokens, vocab_size);
+# Ok::<(), el_core::EdgeError>(())
+```
+
+## Building (workspace-excluded)
+
+This crate depends on `llguidance` + `toktrie_hf_tokenizers` (which require
+crates.io) and a native tokenizer build, so it is **excluded from the offline
+workspace** and declares its own empty `[workspace]` table to build standalone:
+
+```sh
+cargo build  --manifest-path crates/adapters/el-grammar-llguidance/Cargo.toml
+cargo test   --manifest-path crates/adapters/el-grammar-llguidance/Cargo.toml
+```
+
+### API-version note
+
+The call sites target the llguidance / toktrie **1.7** line (llguidance 1.7 +
+toktrie_hf_tokenizers 1.7 + tokenizers 0.21 — they must share one `toktrie`). If
+a build fails with an API mismatch on `ByteTokenizer`, `ParserFactory`, or
+`TokenParser`, the relevant call sites are marked with `// llg-api:` comments.
+
+> Note: this crate uses the `onig` tokenizer backend (unlike `el-engine-candle`,
+> which uses the pure-Rust `fancy-regex` backend per ADR-008).
+
+## Status
+
+Implemented and tested (integer-schema masking constrains digit tokens;
+allow-all fallback on exhaustion). Lark/CFG grammars are tracked follow-up work.
+
+---
+
+Part of the [Edge Intelligence](../../../README.md) workspace (excluded build).
+Realizes [ADR-004](../../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md);
+see the [Grammar Constraint](../../../docs/ddd/bounded-contexts/04-grammar-constraint.md) context.
diff --git a/crates/adapters/el-provenance-ed25519/README.md b/crates/adapters/el-provenance-ed25519/README.md
new file mode 100644
index 0000000..489f2d8
--- /dev/null
+++ b/crates/adapters/el-provenance-ed25519/README.md
@@ -0,0 +1,57 @@
+# el-provenance-ed25519 — real Ed25519 signature verifier
+
+The production `SignatureVerifier` for the model-provenance load gate (ADR-006),
+implemented over [`ed25519-dalek`](https://crates.io/crates/ed25519-dalek) v2.
+
+It plugs into the gate *logic* in [`el-provenance`](../../el-provenance): this
+adapter answers "is this signature valid?", and `el-provenance` decides what to
+do about it (issue a `LoadPermit`, or hard-stop). Pure-Rust dependency tree, no
+`unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+- **`Ed25519Verifier`** — verifies model signatures against a set of trusted
+  provider public keys, keyed by `public_key_id` (the trust-anchor reference
+  from ADR-006):
+  - `new()` / `Default`
+  - `register(id, key_bytes: [u8; 32])` — register a trusted public key
+  - implements `el_provenance::SignatureVerifier::verify`, which **rejects**
+    (returns `false`) on an unknown key id, a malformed signature, or a
+    verification failure — every failure mode is a hard stop upstream.
+
+## Usage
+
+```rust
+use el_core::{ModelFormat, ModelId, ModelVersion};
+use el_provenance::ModelArtifact;
+use el_provenance_ed25519::Ed25519Verifier;
+
+let mut verifier = Ed25519Verifier::new();
+verifier.register(/* public_key_id */ 1, trusted_public_key_bytes)?;
+
+let mut artifact = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf);
+artifact.verify(&verifier, model_bytes, signature_bytes, 1);
+
+// Verified → a LoadPermit; tampered bytes, a forged signature, or an unknown
+// key id → a hard error with no fallback.
+let permit = artifact.ensure_loadable()?;
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+## Why it's a separate adapter
+
+The core gate lives in `el-provenance` with **zero external dependencies** (only
+`el-core`) so it cross-compiles everywhere and stays unit-testable offline with
+doubles. This adapter isolates the one crates.io dependency (`ed25519-dalek`)
+behind the `SignatureVerifier` seam. It is a regular workspace member.
+
+## Status
+
+Implemented and tested — genuine signatures verify; tampering, forged
+signatures, malformed signatures, and unknown key ids are all rejected.
+
+---
+
+Part of the [Edge Intelligence](../../../README.md) workspace. Realizes
+[ADR-006](../../../docs/adr/ADR-006-mandatory-ed25519-model-signature-verification-load-gate.md);
+see the [Model Provenance](../../../docs/ddd/bounded-contexts/08-model-provenance.md) context.
diff --git a/crates/el-core/README.md b/crates/el-core/README.md
new file mode 100644
index 0000000..67cf956
--- /dev/null
+++ b/crates/el-core/README.md
@@ -0,0 +1,80 @@
+# el-core — shared domain vocabulary
+
+The foundational crate of the Edge Intelligence SDK: the *ubiquitous language*
+of the project turned into Rust types. Every other crate speaks in terms of the
+ids, value objects, errors, events, and the provider trait defined here.
+
+`el-core` has **zero external dependencies** — pure `std`, so it compiles
+offline on any target including `wasm32` (ADR-008). It contains no I/O, no
+network, and no `unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+| Module | Key types | Purpose |
+|--------|-----------|---------|
+| `ids` | `SessionId`, `ModelId`, `ModelVersion` | Identifier value objects |
+| `value_objects` | `Token`, `ModelFormat`, `RuntimeKind`, `DeviceTarget`, `Phase`, `SafetyMode`, `SpeculationMode`, `StopReason` | Core enums and the `Token = u32` alias |
+| `config` | `SessionConfig` | Immutable per-session configuration |
+| `error` | `EdgeError`, `Result<T>` | The SDK-wide error type |
+| `events` | `DomainEvent`, `EventEnvelope`, `DegradeReason` | Content-free domain events |
+| `provider` | `LlmProvider`, `ChatRequest`, `ChatResponse`, `ChatMessage`, `ChatRole`, `ChatToken`, `CredentialRef` | The unified backend abstraction |
+
+All names are re-exported at the crate root, e.g. `use el_core::{LlmProvider, ChatRequest};`.
+
+## Cross-cutting invariants encoded here
+
+These are enforced by the type system, not by convention:
+
+- **Content-free events (ADR-007).** `DomainEvent` and `EventEnvelope` derive
+  `Copy`. A `String`/`Vec`/heap field is not `Copy`, so adding one fails to
+  compile — "no prompt or response content on an event" is a *compile-time*
+  guarantee. Ratios and scores are carried as fixed-point integers (`*_milli`).
+- **Air-gap by default (ADR-004).** `SessionConfig::default().hybrid_mode` is
+  `false`. The only network seam is an explicit opt-in.
+- **Unified provider (ADR-010).** `LlmProvider` covers both the local Candle
+  engine and cloud frontier backends behind one trait, so host apps can swap
+  local ↔ frontier without touching their UI.
+- **Redacted credentials.** `CredentialRef`'s `Debug` output is
+  `CredentialRef([REDACTED])`, so bearer keys cannot leak into logs or panic
+  messages.
+
+## Usage
+
+```rust
+use el_core::{ChatMessage, ChatRequest, LlmProvider, SessionConfig};
+
+// Configuration is air-gapped by default (ADR-004).
+let cfg = SessionConfig::default();
+assert!(!cfg.hybrid_mode);
+
+// A backend-agnostic request. `model` is a routing hint:
+//   "local"/"" → local engine, "openai/…", "anthropic/…", "ollama/…", "gemini/…"
+let req = ChatRequest::new("local", vec![ChatMessage::user("Hello!")])
+    .with_max_tokens(256)
+    .with_temperature(700); // 700 = 0.7 (milli to keep the type Eq-able)
+
+// Any backend is reached through the same trait.
+fn run(provider: &dyn LlmProvider, req: &ChatRequest) -> el_core::Result<String> {
+    Ok(provider.chat(req)?.content)
+}
+```
+
+## Place in the workspace
+
+`el-core` is the root of the dependency graph: `el-memory`, `el-telemetry`,
+`el-provenance`, `el-safety`, `el-runtime`, `el-grammar`, and every adapter
+depend on it, and it depends on nothing. Keep it dependency-free — that property
+is what lets the local core cross-compile to WASM and mobile targets.
+
+## Status
+
+Implemented and tested.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md),
+[ADR-007](../../docs/adr/ADR-007-content-free-domain-events-privacy-by-construction-telemetry.md),
+[ADR-008](../../docs/adr/ADR-008-implement-the-sdk-in-rust-instead-of-c-cpp.md),
+and [ADR-010](../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md).
+The vocabulary mirrors [`docs/ddd/ubiquitous-language.md`](../../docs/ddd/ubiquitous-language.md).
diff --git a/crates/el-grammar/README.md b/crates/el-grammar/README.md
new file mode 100644
index 0000000..377f2d2
--- /dev/null
+++ b/crates/el-grammar/README.md
@@ -0,0 +1,66 @@
+# el-grammar — pure-Rust DFA token masking
+
+Grammar-constrained decoding via **DFA token masking**, in pure Rust (the
+Grammar Constraint context). A grammar is compiled to a deterministic automaton
+over **token ids** (the alphabet). Each decode step, the masker replays the
+committed tokens to find the current state and allows only tokens with a valid
+transition — the exact token-level masking mechanism that XGrammar/llguidance
+implement, here for *regular* grammars with no external dependencies.
+
+Depends on `el-core` and `el-runtime`. No `unsafe` (`#![forbid(unsafe_code)]`).
+
+## Scope
+
+This crate covers **regular grammars over an explicit token alphabet**. Full
+context-free / JSON-schema grammars with a real tokenizer environment are the
+production scale-up in
+[`el-grammar-llguidance`](../adapters/el-grammar-llguidance). Both implement the
+same `el_runtime::GrammarMasker` port, so the runtime is agnostic to which one
+is wired.
+
+## What it provides
+
+- **`Dfa`** — a deterministic finite automaton over `Token` ids, with a builder
+  API: `Dfa::new(start).transition(from, token, to).accept(state)`. Plus
+  `step`, `run` (replay a token sequence), and `is_accepting`.
+- **`DfaMasker`** — wraps a `Dfa` and implements `el_runtime::GrammarMasker`.
+  `mask(recent, vocab)` returns a per-token allow mask; a dead (invalid) state
+  yields an all-`false` mask (nothing legal). `accepts(committed)` reports
+  whether stopping now would be valid.
+- **`StateId`** — `u32` alias for automaton states.
+
+## Usage
+
+```rust
+use el_grammar::{Dfa, DfaMasker};
+use el_runtime::GrammarMasker;
+
+// A grammar that accepts exactly the token sequence 5, 5, 9.
+let dfa = Dfa::new(0)
+    .transition(0, 5, 1)
+    .transition(1, 5, 2)
+    .transition(2, 9, 3)
+    .accept(3);
+let masker = DfaMasker::new(dfa);
+
+// At the start, only token 5 is legal.
+let mask = masker.mask(&[], 10);
+assert!(mask[5] && !mask[9]);
+
+// Wired into a session's Ports, this forces grammar-valid output even when the
+// engine's raw logits would prefer something else.
+assert!(masker.accepts(&[5, 5, 9]));
+```
+
+Drop it into a session via `Ports { grammar: Box::new(masker), ..Ports::permissive() }`.
+
+## Status
+
+Implemented and tested, including an end-to-end test that constrains real
+decoding inside `el_runtime::InferenceSession`.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace; see the
+[Grammar Constraint](../../docs/ddd/bounded-contexts/04-grammar-constraint.md)
+context and [ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md).
diff --git a/crates/el-memory/README.md b/crates/el-memory/README.md
new file mode 100644
index 0000000..d2ea895
--- /dev/null
+++ b/crates/el-memory/README.md
@@ -0,0 +1,73 @@
+# el-memory — static memory planning
+
+Ahead-of-time memory planning, a single contiguous arena allocated once, and
+descriptor-only KV-cache compaction (ADR-003). The discipline — reproduced from
+ExecuTorch's technique, in pure Rust — is to assign every tensor a fixed offset
+*before* inference so the decode loop performs **no heap allocation**, and to
+resolve KV pruning by shuffling descriptors rather than copying data.
+
+Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+- **`StaticMemoryPlanner::plan(tensors, budget_bytes)`** — interval-colours
+  tensor lifetimes per memory tier so that tensors whose lifetimes do **not**
+  overlap reuse the same offset. Returns a `MemoryPlan`, or
+  `EdgeError::MemoryBudgetExceeded` when the packed plan does not fit the budget
+  (the signal the runtime uses to disable optional features or spill).
+- **`MemoryPlan`** — the immutable allocation: `placements()`, `placement(id)`,
+  `sram_bytes()`, `dram_bytes()`, `total_bytes()`.
+- **`TensorSpec`** / **`BufferLifetime`** / **`MemoryTier`** (`Sram` | `Dram`) /
+  **`Placement`** / **`TensorId`** / **`TensorOffset`** — the planner's inputs
+  and outputs.
+- **`Arena`** — a contiguous byte buffer allocated once at session init; the
+  decode loop borrows planned sub-slices via `region()` / `region_mut()` and
+  never allocates again.
+- **`KvRegion`** / **`KvSlot`** — the KV cache addressed through descriptors.
+  `compact()` removes pruned descriptors and re-indexes survivors **without
+  moving payload bytes** — survivors keep their original `offset`.
+
+## Usage
+
+```rust
+use el_memory::{Arena, BufferLifetime, KvRegion, MemoryTier, StaticMemoryPlanner, TensorSpec};
+
+// Two same-tier tensors with disjoint lifetimes share one offset, so the tier
+// needs max(size), not the sum.
+let tensors = [
+    TensorSpec { id: 1, size: 100, tier: MemoryTier::Dram, lifetime: BufferLifetime::new(0, 2) },
+    TensorSpec { id: 2, size: 100, tier: MemoryTier::Dram, lifetime: BufferLifetime::new(3, 5) },
+];
+let plan = StaticMemoryPlanner::plan(&tensors, 1024 * 1024)?;
+assert_eq!(plan.dram_bytes(), 100); // reused, not 200
+
+// Allocate the arena once; the decode loop borrows planned regions.
+let mut arena = Arena::new(plan.total_bytes() as usize);
+if let Some(region) = arena.region_mut(0, 100) { /* fill weights */ }
+
+// KV compaction shuffles descriptors only — no data copy.
+let mut kv = KvRegion::new();
+kv.push(0);   // token 0 @ offset 0
+kv.push(64);  // token 1 @ offset 64
+kv.mark_pruned(0);
+let reclaimed = kv.compact(); // 1; the survivor keeps offset 64
+# Ok::<(), el_core::EdgeError>(())
+```
+
+## Place in the workspace
+
+Used by `el-runtime` (the session owns a `KvRegion`). The planner's
+`MemoryBudgetExceeded` error and the `MemoryPlanCreated`/`KvCacheCompacted`
+domain events tie this crate to the Telemetry & Privacy context.
+
+## Status
+
+Implemented and tested. The `Arena` models the allocate-once + fixed-offset
+contract in safe Rust; true OS page-alignment and DMA wiring are a platform
+concern handled when a real engine path is attached.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-003](../../docs/adr/ADR-003-static-memory-planning-with-zero-allocation-arena.md);
+see the [Memory Management](../../docs/ddd/bounded-contexts/06-memory-management.md) context.
diff --git a/crates/el-provenance/README.md b/crates/el-provenance/README.md
new file mode 100644
index 0000000..25c8fa8
--- /dev/null
+++ b/crates/el-provenance/README.md
@@ -0,0 +1,63 @@
+# el-provenance — model-signature load gate
+
+The hard model-signature load gate (ADR-006). This crate owns the *decision
+logic*: a `ModelArtifact` must reach `Verified` before a `LoadPermit` is issued,
+and a missing or failing signature is a **hard stop with no fallback**.
+
+The actual ED25519 maths is abstracted behind the `SignatureVerifier` trait so
+the gate is testable offline. The real `ed25519-dalek` implementation lives in
+the [`el-provenance-ed25519`](../adapters/el-provenance-ed25519) adapter.
+
+Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+- **`SignatureVerifier`** — the abstracted primitive:
+  `verify(bytes, signature, public_key_id) -> bool`. Implemented for real by the
+  ed25519 adapter and by test doubles.
+- **`ModelArtifact`** — a model file plus its provenance metadata
+  (`id`, `version`, `format`, `status`). `verify(...)` transitions the status
+  *before* any load/mmap; `ensure_loadable()` is the gate itself.
+- **`LoadPermit`** — the capability token proving an artifact passed the gate.
+  It **cannot be constructed except via `ensure_loadable()`**, so a model that
+  has not been verified cannot reach the runtime — `el_runtime::InferenceSession`
+  requires a `LoadPermit` to be built (the Conformist relationship, enforced in
+  the type system).
+- **`VerificationStatus`** — `Unverified` | `Verified` | `Rejected`.
+
+## Usage
+
+```rust
+use el_core::{ModelFormat, ModelId, ModelVersion};
+use el_provenance::{ModelArtifact, SignatureVerifier};
+
+// Plug in a real verifier (el-provenance-ed25519) or a test double.
+struct AlwaysOk;
+impl SignatureVerifier for AlwaysOk {
+    fn verify(&self, _bytes: &[u8], _sig: &[u8], _key_id: u32) -> bool { true }
+}
+
+let mut artifact = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf);
+artifact.verify(&AlwaysOk, b"<model-bytes>", b"<signature>", /* public_key_id */ 7);
+
+// No verified signature → no permit → no session.
+let permit = artifact.ensure_loadable()?; // Err(UnverifiedModel | SignatureRejected) otherwise
+# Ok::<(), el_core::EdgeError>(())
+```
+
+## Place in the workspace
+
+`el-runtime` accepts a `LoadPermit` (not raw bytes) to construct a session, so
+this gate sits in front of every inference path. The ed25519 adapter provides
+the production `SignatureVerifier`.
+
+## Status
+
+Implemented and tested. The gate *logic* is fully covered here; real signature
+verification is provided by the ed25519 adapter.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-006](../../docs/adr/ADR-006-mandatory-ed25519-model-signature-verification-load-gate.md);
+see the [Model Provenance](../../docs/ddd/bounded-contexts/08-model-provenance.md) context.
diff --git a/crates/el-runtime/README.md b/crates/el-runtime/README.md
new file mode 100644
index 0000000..727eb12
--- /dev/null
+++ b/crates/el-runtime/README.md
@@ -0,0 +1,92 @@
+# el-runtime — session lifecycle & decode-loop orchestration
+
+The Core of the SDK: the inference session state machine, the port traits that
+collaborator contexts plug into, and the decode-loop orchestrator (ADR-001).
+
+Air-gap is **structural** here (ADR-004): this crate has no network dependency,
+and the only outbound seam is the opt-in `HybridRelay` port. No `unsafe`
+(`#![forbid(unsafe_code)]`).
+
+## The decode-step invariant
+
+Every decode step composes its collaborators in a fixed, invariant order:
+
+```
+grammar mask  →  safety adjust  →  sample (greedy argmax)  →  commit KV
+```
+
+Grammar runs *before* safety, so safety steering only ever operates over
+already-legal tokens. This ordering is enforced in `InferenceSession::generate`
+and covered by tests.
+
+## What it provides
+
+- **`InferenceSession<E: InferenceEngine>`** — the aggregate root. Constructing
+  it **requires a `LoadPermit`** (`el-provenance`), so an unverified model cannot
+  reach the runtime. Drives the `Initialized → Prefilling → Decoding → Completed`
+  phases via `load_prompt()`, `generate()`, `reset()`, and emits content-free
+  `EventEnvelope`s (`drain_events()`).
+- **Port traits** (the collaborator seams):
+  - `InferenceEngine` — `prefill`, `next_logits` (integer milli-logits), `eos_token`.
+  - `PromptCompressor` — optional LLMLingua-2-style compression.
+  - `GrammarMasker` — per-token allow mask.
+  - `HybridRelay` — opt-in **LAN-only** relay; there is no cloud variant.
+- **`Ports`** — the collaborator bundle bound to a session; `Ports::permissive()`
+  gives identity compression, allow-all grammar, no safety, and no relay.
+- **Defaults** — `IdentityCompressor`, `AllowAllMasker`, and `NullEngine` (emits
+  EOS right after prefill) let you exercise the full lifecycle without any
+  external adapter.
+- **Re-exports** `el_safety::{SafetySteerer, LogitAdjustment}` so callers wire
+  one type system.
+
+`consult_relay()` hard-fails with `EdgeError::AirGapViolation` unless
+`hybrid_mode` is enabled **and** a relay is wired (ADR-004).
+
+## Usage
+
+```rust
+use el_core::{SessionConfig, SessionId, StopReason};
+use el_provenance::{ModelArtifact, SignatureVerifier};
+use el_core::{ModelFormat, ModelId, ModelVersion};
+use el_runtime::{InferenceSession, NullEngine, Ports};
+
+// A LoadPermit is required to build a session (ADR-006).
+struct Ok_; impl SignatureVerifier for Ok_ { fn verify(&self, _: &[u8], _: &[u8], _: u32) -> bool { true } }
+let mut art = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf);
+art.verify(&Ok_, b"w", b"s", 1);
+let permit = art.ensure_loadable()?;
+
+let mut session = InferenceSession::new(
+    SessionId(1),
+    SessionConfig::default(),
+    NullEngine::new(/* eos */ 3, /* vocab */ 8),
+    permit,
+);
+
+let ports = Ports::permissive();
+session.load_prompt(&ports, &[10, 11, 12])?;          // compress → prefill → KV
+let stop = session.generate(&ports, 16)?;             // runs the decode loop
+assert_eq!(stop, StopReason::Eos);
+# Ok::<(), el_core::EdgeError>(())
+```
+
+A real engine plugs into `InferenceEngine` (see
+[`el-engine-candle`](../adapters/el-engine-candle)); a real grammar plugs into
+`GrammarMasker` (see [`el-grammar`](../el-grammar) and
+[`el-grammar-llguidance`](../adapters/el-grammar-llguidance)).
+
+## Place in the workspace
+
+Depends on `el-core`, `el-memory` (owns a `KvRegion`), `el-safety`, and
+`el-provenance`. It is the hub every adapter wires into.
+
+## Status
+
+Implemented and tested.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-001](../../docs/adr/ADR-001-adopt-webassembly-as-cross-platform-sdk-runtime.md)
+and [ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md);
+the decode order is specified in [`docs/ddd/domain-events.md`](../../docs/ddd/domain-events.md).
diff --git a/crates/el-safety/README.md b/crates/el-safety/README.md
new file mode 100644
index 0000000..8790b7e
--- /dev/null
+++ b/crates/el-safety/README.md
@@ -0,0 +1,64 @@
+# el-safety — on-device decoder-time safety
+
+On-device, tiered, decoder-time safety (ADR-005). Safety is applied as a
+per-step logit adjustment during decoding — **after** the grammar mask and
+**before** sampling — so it only ever steers over already-legal tokens. **No
+safety path touches the network.**
+
+Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`).
+
+## What it provides
+
+- **`SafetySteerer`** — the per-step intervention trait: `adjust(recent_tokens)
+  -> LogitAdjustment` and `mode()`. The runtime calls this each decode step.
+- **`LogitAdjustment`** — a sparse, integer (milli-logit) vector of per-token
+  deltas applied to the target logits. Sparse + integer keeps steering
+  deterministic and allocation-light. Accessors: `delta_for(token)`,
+  `l1_norm_milli()` (what the `LogitsSteered` telemetry event reports), `is_empty()`.
+- **`SafetyModeSelector::resolve(requested, device)`** — budget-gates the tier
+  by device profile: `SecDecoding` (two ~1B models) is downgraded to
+  `Lightweight` on a `MidRange` device.
+- **Steerers per `SafetyMode`:**
+  - `NoSafety` (`Off`) — a no-op.
+  - `LightweightFilter` (`Lightweight`) — a training-free blacklist filter
+    (**fully implemented**). Banned tokens receive `HARD_BAN = -1_000_000`
+    milli-logits so they cannot be sampled.
+  - `SecDecodingSteerer` (`SecDecoding`) — base-vs-safety-model steering.
+    **Scaffolded** follow-up: it requires two ~1B models on Candle, so until the
+    assets are wired it returns no adjustment while honestly reporting its mode
+    (so callers can select it without it silently mis-steering).
+
+## Usage
+
+```rust
+use el_core::{DeviceTarget, SafetyMode};
+use el_safety::{LightweightFilter, SafetyModeSelector, SafetySteerer};
+
+// On a mid-range device, SecDecoding is downgraded to a tier it can afford.
+let mode = SafetyModeSelector::resolve(SafetyMode::SecDecoding, DeviceTarget::MidRange);
+assert_eq!(mode, SafetyMode::Lightweight);
+
+// Lightweight bans specific token ids outright.
+let filter = LightweightFilter::new(vec![42, 99]);
+let adj = filter.adjust(&[]);
+assert_eq!(adj.delta_for(42), LightweightFilter::HARD_BAN);
+assert_eq!(adj.delta_for(7), 0);
+```
+
+## Place in the workspace
+
+Re-exported by `el-runtime` (`el_runtime::{SafetySteerer, LogitAdjustment}`) so
+callers wire a single type system. The session applies the chosen steerer in the
+invariant decode order `grammar mask → safety adjust → sample → commit`.
+
+## Status
+
+Partial by design: the `Lightweight` blacklist path is real and tested;
+`SecDecoding`/`Csd` model-backed steering is a tracked follow-up that needs
+model assets.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-005](../../docs/adr/ADR-005-on-device-only-tiered-decoder-time-safety.md);
+see the [Safety](../../docs/ddd/bounded-contexts/05-safety.md) context.
diff --git a/crates/el-telemetry/README.md b/crates/el-telemetry/README.md
new file mode 100644
index 0000000..e8b7723
--- /dev/null
+++ b/crates/el-telemetry/README.md
@@ -0,0 +1,61 @@
+# el-telemetry — content-free metrics collector
+
+A one-way, downstream subscriber that folds content-free
+[`el_core::DomainEvent`](../el-core)s into performance snapshots (ADR-007).
+
+It depends on `el-core` and **nothing depends on it**, and it has no network
+channel of its own. Because it can only ever read the numeric and enum fields of
+events that are *already* content-free by construction, "no user content in
+telemetry" is structural — there is no code path by which a prompt or response
+could reach a metric.
+
+## What it provides
+
+- **`MetricsCollector`** — subscribes to the domain-event stream and maintains a
+  running snapshot. Call `observe(&envelope)` per event and `snapshot()` to read.
+- **`TelemetrySnapshot`** — a `Copy` struct of counters and gauges:
+  `prefill_tps`, `decode_tps`, `ttft_ms`, `peak_bytes`, `tokens_generated`,
+  `compressions`, `safety_violations`.
+
+`peak_bytes` is a monotonic high-water mark — it only ever rises across the
+events it observes.
+
+## Usage
+
+```rust
+use el_core::{DomainEvent, EventEnvelope, SessionId};
+use el_telemetry::MetricsCollector;
+
+let mut collector = MetricsCollector::new();
+
+collector.observe(&EventEnvelope::new(
+    SessionId(1),
+    0,
+    DomainEvent::PrefillCompleted { prompt_tokens: 100, kv_len: 100, prefill_tps: 480 },
+));
+collector.observe(&EventEnvelope::new(
+    SessionId(1),
+    1,
+    DomainEvent::TokenCommitted { kv_len: 101 },
+));
+
+let snap = collector.snapshot();
+assert_eq!(snap.prefill_tps, 480);
+assert_eq!(snap.tokens_generated, 1);
+```
+
+## Place in the workspace
+
+In a full build, the host application drains `EventEnvelope`s from an
+`InferenceSession` (or a `CloudProvider`'s event sink) and feeds them here;
+`el-runtime` never calls into this crate. It is the read-only end of that pipeline.
+
+## Status
+
+Implemented and tested.
+
+---
+
+Part of the [Edge Intelligence](../../README.md) workspace. Realizes
+[ADR-007](../../docs/adr/ADR-007-content-free-domain-events-privacy-by-construction-telemetry.md);
+see the [Telemetry & Privacy](../../docs/ddd/bounded-contexts/09-telemetry-privacy.md) context.

From a23bcca858a8bfc00f1b9e337227945445fac534 Mon Sep 17 00:00:00 2001
From: Tovli <Dekel@tovli.co.il>
Date: Mon, 15 Jun 2026 17:29:36 +0300
Subject: [PATCH 2/2] docs(readme): add architecture/entry-points section and
 collapsible layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add an "Architecture and entry points" section: a three-layer table
  (el-ffi/EdgeLlm device facade → el-core/LlmProvider seam → el-runtime
  orchestrator) and a layered diagram, making the SDK entry point explicit.
- Restructure long sections into collapsible <details> blocks with a Contents
  TOC for readability (Why, decode pipeline, quick-start extras, chat client,
  workspace map, ADRs, domain model, roadmap).
- Fix the stale "workspace-excluded" label on the el-ffi row — el-ffi and
  el-cloud are regular workspace members; only el-grammar-llguidance is
  excluded. Link each crate to its new per-crate README.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md | 182 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 149 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index d19b01f..d3587c0 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,29 @@ privacy-preserving telemetry, and host bindings for mobile and web runtimes.
 Local inference is the default path. Frontier and OpenAI-compatible providers
 exist only behind an explicit opt-in backend.
 
+> **New here?** Jump to [Architecture and entry points](#architecture-and-entry-points)
+> to see where the SDK starts and how the crates fit together, then
+> [Quick start](#quick-start) to build it.
+
+## Contents
+
+- [Why Edge Intelligence?](#why-edge-intelligence)
+- [Architecture and entry points](#architecture-and-entry-points)
+- [Quick start](#quick-start)
+- [Local chat test client](#local-chat-test-client)
+- [Workspace map](#workspace-map)
+- [Architecture decisions](#architecture-decisions)
+- [Domain model](#domain-model)
+- [Roadmap](#roadmap)
+- [Documentation](#documentation)
+
 ## Why Edge Intelligence?
 
 Most mobile LLM stacks start in the cloud and add local features later. This
-project starts at the edge:
+project starts at the edge.
+
+<details>
+<summary><b>The six principles that shape every crate</b></summary>
 
 | Principle | What it means in the SDK |
 |-----------|--------------------------|
@@ -35,7 +54,61 @@ project starts at the edge:
 | **Provable provenance** | Model signatures are verified before a session can be constructed. |
 | **Opt-in egress** | Cloud/frontier providers are a separate adapter and must be wired deliberately by the host app. |
 
-## What It Does
+</details>
+
+## Architecture and entry points
+
+Edge Intelligence is a **hexagonal (ports-and-adapters) workspace, not a single
+monolithic crate**. There is no `edge-intelligence` umbrella crate; coherence
+comes from three layers, each with a clear entry point:
+
+| Layer | Crate · symbol | Where it fits |
+|-------|----------------|---------------|
+| **Device SDK facade** | [`el-ffi`](crates/adapters/el-ffi) · `EdgeLlm` | The composition root shipped to devices. Wires the local engine and opt-in cloud behind one flat API and projects it to React Native (UniFFI/JSI), Flutter (FRB), and Web (wasm-bindgen). **Start here to build an app.** |
+| **Rust API seam** | [`el-core`](crates/el-core) · `LlmProvider` | The single trait every backend implements and every Rust consumer calls (ADR-010). **Start here to embed the SDK in Rust.** |
+| **Orchestrator** | [`el-runtime`](crates/el-runtime) · `InferenceSession` | Composes provenance, memory, safety, and grammar into the decode loop — the orchestrator that the local provider drives. |
+
+```text
+  Edge device app  ·  Kotlin · Swift · Dart · TypeScript
+          │
+          ▼
+  el-ffi · EdgeLlm                      ← device SDK entry point (composition root)
+          │
+          ▼
+  el-core · LlmProvider (trait)         ← unified Rust API seam (ADR-010)
+          │
+     ┌────┴───────────────┐
+     ▼                     ▼
+  el-engine-candle      el-cloud        ← backends: local (default) · opt-in frontier
+     │
+     ▼
+  el-runtime · InferenceSession         ← orchestrator
+     │   composes
+     ▼
+  el-memory · el-provenance · el-safety · el-grammar
+```
+
+<details>
+<summary><b>Which entry point should I use?</b></summary>
+
+- **Building a mobile or web app** → use [`el-ffi`](crates/adapters/el-ffi)'s
+  `EdgeLlm`. Construct `EdgeLlm::local(model_uri)` (air-gapped) or
+  `EdgeLlm::cloud(model, api_key)` (opt-in), then call `ask(...)` /
+  `ask_stream(...)`. The crate compiles to a native library and a wasm package
+  and ships generated TypeScript/Dart bindings.
+- **Embedding the SDK in Rust** → construct a concrete provider and talk to it
+  through [`el_core::LlmProvider`](crates/el-core): `el_engine_candle::QwenChatProvider`
+  for on-device chat, or `el_cloud::CloudProvider` for a frontier backend.
+  [`apps/el-chat`](apps/el-chat) is a worked example.
+- **Extending the SDK** (new engine, grammar, safety, or compression) →
+  implement the matching port trait from [`el-runtime`](crates/el-runtime)
+  (`InferenceEngine`, `GrammarMasker`, `SafetySteerer`, `PromptCompressor`),
+  or implement `LlmProvider` directly for a whole new backend.
+
+</details>
+
+<details>
+<summary><b>The per-token decode pipeline and SDK seams</b></summary>
 
 ```text
 Host app
@@ -56,7 +129,7 @@ load gate -> memory plan -> prefill -> decode loop
                          content-free events and metrics
 ```
 
-The current workspace proves the main seams of the SDK:
+The workspace proves the main seams of the SDK:
 
 - **Runtime orchestration:** `el-runtime` owns the session state machine and
   enforces the decode order: grammar mask, safety adjustment, sampling, commit.
@@ -70,12 +143,15 @@ The current workspace proves the main seams of the SDK:
 - **Safety:** `el-safety` provides the tiered policy model and lightweight
   blacklist steering path, with SecDecoding-style model-backed safety tracked as
   follow-up work.
-- **Inference engine seam:** `el-engine-candle` runs a real Candle CPU forward
-  on a toy in-code model and drives the runtime loop end to end.
+- **Inference engine seam:** `el-engine-candle` runs real Candle CPU forward
+  passes — a single-projection seam proof plus a real Qwen2 transformer — and
+  drives the runtime loop end to end.
 - **Provider seam:** `el-core::LlmProvider` gives local and frontier backends one
   host-facing API; `el-cloud` implements the opt-in OpenAI-compatible path.
 
-## Quick Start
+</details>
+
+## Quick start
 
 Prerequisite: Rust 1.96 or newer, matching the workspace `rust-version`.
 
@@ -84,13 +160,16 @@ cargo build --workspace
 cargo test --workspace
 ```
 
-Build just the dependency-light local core:
+<details>
+<summary><b>Build just the dependency-light core, or cross-compile to WASM</b></summary>
+
+Build and test only the pure-Rust local core (no Candle, no network):
 
 ```sh
 cargo test -p el-core -p el-memory -p el-telemetry -p el-provenance -p el-safety -p el-runtime -p el-grammar
 ```
 
-Cross-compile the pure Rust core for WASM:
+Cross-compile that core for WASM:
 
 ```sh
 rustup target add wasm32-wasip1 wasm32-unknown-unknown
@@ -98,16 +177,26 @@ rustup target add wasm32-wasip1 wasm32-unknown-unknown
 cargo build --target wasm32-wasip1 -p el-core -p el-memory -p el-telemetry -p el-provenance -p el-safety -p el-runtime -p el-grammar
 ```
 
-## Local Chat Test Client
+Cross-compile the device bindings (`el-ffi`) for Android / iOS / Web via the
+[`Makefile`](Makefile): `make build-android`, `make build-ios`, `make build-wasm`,
+or `make bindings` for all three codegen surfaces.
+
+</details>
+
+## Local chat test client
 
 [`apps/el-chat`](apps/el-chat) is an interactive REPL that holds a real
-multi-turn conversation with a small LLM running **entirely on-device**. Its
-purpose is to exercise the SDK end-to-end, so its only direct dependencies are
-SDK crates (`el-core`, `el-engine-candle`) — it contains no inference, model, or
-tokenizer code of its own. Every reply flows through the ADR-010
-`LlmProvider` seam:
+multi-turn conversation with a small LLM running **entirely on-device**. It
+exists to exercise the SDK end-to-end, so its only direct dependencies are SDK
+crates (`el-core`, `el-engine-candle`) — it contains no inference, model, or
+tokenizer code of its own.
 
-```
+<details>
+<summary><b>Fetch a model and run it</b></summary>
+
+Every reply flows through the ADR-010 `LlmProvider` seam:
+
+```text
 el-chat  →  el_core::LlmProvider  →  el_engine_candle::QwenChatProvider
                                        (real Qwen2 forward via candle-transformers)
                                   →  el_runtime::InferenceSession
@@ -135,17 +224,23 @@ cargo run -p el-chat -- --system "Be terse." --max-tokens 128
 ```
 
 REPL commands: `/reset`, `/system <text>`, `/help`, `/exit`. Other flags:
-`--model`, `--tokenizer`, `--system`, `--max-tokens`. The `models/`
-directory is git-ignored. Decoding is deterministic (the SDK runtime decodes
-greedily), so the same prompt yields the same reply.
+`--model`, `--tokenizer`, `--system`, `--max-tokens`. The `models/` directory is
+git-ignored. See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full
+user guide.
+
+</details>
+
+## Workspace map
 
-See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide.
+Twelve crates plus the chat app. Each crate has its own README (linked below)
+covering its public API, a usage example, and the ADRs it realizes.
 
-## Workspace Map
+<details>
+<summary><b>Show the full crate table</b></summary>
 
 | Crate | Role | Current state |
 |-------|------|---------------|
-| [`crates/el-core`](crates/el-core) | Shared types, IDs, errors, events, provider trait | Implemented and tested |
+| [`crates/el-core`](crates/el-core) | Shared types, IDs, errors, events, `LlmProvider` trait | Implemented and tested |
 | [`crates/el-memory`](crates/el-memory) | Static arena planning and KV-cache descriptors | Implemented and tested |
 | [`crates/el-telemetry`](crates/el-telemetry) | Content-free event handling and privacy metrics | Implemented and tested |
 | [`crates/el-provenance`](crates/el-provenance) | Verified model load permits | Implemented and tested |
@@ -154,16 +249,26 @@ See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide.
 | [`crates/el-grammar`](crates/el-grammar) | DFA grammar masking | Implemented and tested |
 | [`crates/adapters/el-provenance-ed25519`](crates/adapters/el-provenance-ed25519) | Real ED25519 signature verification | Implemented and tested |
 | [`crates/adapters/el-engine-candle`](crates/adapters/el-engine-candle) | Candle inference adapter: engine-seam proof plus a real Qwen2 transformer engine and chat provider | Implemented; real on-device chat |
-| [`crates/adapters/el-cloud`](crates/adapters/el-cloud) | Opt-in OpenAI-compatible provider backend | Implemented as an explicit egress adapter |
+| [`crates/adapters/el-cloud`](crates/adapters/el-cloud) | Opt-in OpenAI-compatible provider backend | Implemented; egress opt-in at construction |
+| [`crates/adapters/el-ffi`](crates/adapters/el-ffi) | **Device SDK facade (`EdgeLlm`):** Flutter / UniFFI / wasm-bindgen binding surfaces | Implemented and tested; host build is a workspace member, cross-target builds via `make` |
 | [`crates/adapters/el-grammar-llguidance`](crates/adapters/el-grammar-llguidance) | llguidance JSON-schema token masking | Implemented and tested; workspace-excluded (crates.io deps) |
-| [`crates/adapters/el-ffi`](crates/adapters/el-ffi) | Flutter/UniFFI/wasm-bindgen binding surfaces | Implemented and tested (native + wasm32 compile); workspace-excluded (cross toolchains) |
 | [`apps/el-chat`](apps/el-chat) | Interactive chat test client; SDK-only deps, drives the runtime end-to-end | Implemented; runs real on-device chat |
 
-## Architecture Decisions
+Of the adapters, only `el-grammar-llguidance` is excluded from the default
+workspace build (it pulls crates.io-only grammar dependencies); `el-cloud` and
+`el-ffi` are regular members whose host targets build and test with
+`cargo test --workspace`.
+
+</details>
+
+## Architecture decisions
 
 The project is intentionally decision-heavy because mobile LLM runtimes are easy
 to overfit to one device, model, or provider. The core choices are recorded as
-ADRs:
+ADRs.
+
+<details>
+<summary><b>The ten architecture decision records</b></summary>
 
 | ADR | Decision |
 |-----|----------|
@@ -180,10 +285,17 @@ ADRs:
 
 See the full index in [`docs/adr/README.md`](docs/adr/README.md).
 
-## Domain Model
+</details>
+
+## Domain model
 
-The DDD model lives in [`docs/ddd`](docs/ddd/README.md). It breaks the SDK into
-nine bounded contexts:
+The DDD model lives in [`docs/ddd`](docs/ddd/README.md). The key invariant
+across its contexts: **air-gap is the default runtime shape, not a feature flag
+sprinkled through the code** — any outbound behavior must be modeled as an
+explicit port or adapter.
+
+<details>
+<summary><b>The nine bounded contexts</b></summary>
 
 1. Inference Runtime
 2. Prompt Compression
@@ -195,14 +307,15 @@ nine bounded contexts:
 8. Model Provenance and Security
 9. Telemetry and Privacy
 
-The key invariant across those contexts: **air-gap is the default runtime shape,
-not a feature flag sprinkled through the code.** Any outbound behavior must be
-modeled as an explicit port or adapter.
+</details>
 
-## What Is Next
+## Roadmap
 
 The prototype has proven the architectural seams. The next engineering work is
-to replace toy proofs with production-grade runtime pieces:
+to replace toy proofs with production-grade runtime pieces.
+
+<details>
+<summary><b>What's next</b></summary>
 
 - Production GGUF/safetensors loading and transformer execution in
   `el-engine-candle`.
@@ -214,11 +327,14 @@ to replace toy proofs with production-grade runtime pieces:
 - On-device benchmarks for time-to-first-token, decode throughput, memory
   high-water marks, and thermal behavior.
 
+</details>
+
 ## Documentation
 
 - Product and technical rationale: [`docs/prd.md`](docs/prd.md)
 - Domain model: [`docs/ddd/README.md`](docs/ddd/README.md)
 - Architecture decisions: [`docs/adr/README.md`](docs/adr/README.md)
+- Per-crate guides: see the [Workspace map](#workspace-map) — every crate links to its own README.
 
 Edge Intelligence is still early, but the direction is deliberate: a small,
 auditable, Rust-native SDK that lets app developers choose local inference first