From 2108fa3bf213cdfca6098360a6ecfa58caffc42c Mon Sep 17 00:00:00 2001 From: Tovli Date: Mon, 15 Jun 2026 17:11:49 +0300 Subject: [PATCH 1/2] docs: add per-crate READMEs and bump workspace to 0.3.8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a README.md to every workspace crate (7 core crates + 5 adapters), each documenting the crate's role, public API, a usage example, workspace placement, status, and links to the ADRs/DDD contexts it realizes — matching the style of apps/el-chat/README.md. Bump the workspace version 0.3.7 -> 0.3.8 via `cargo set-version --workspace` (docs-only change; el-chat inherits, the excluded el-grammar-llguidance is unaffected). Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 24 ++--- Cargo.toml | 22 ++--- crates/adapters/el-cloud/README.md | 83 +++++++++++++++++ crates/adapters/el-engine-candle/README.md | 79 ++++++++++++++++ crates/adapters/el-ffi/README.md | 91 ++++++++++++++++++ .../adapters/el-grammar-llguidance/README.md | 70 ++++++++++++++ .../adapters/el-provenance-ed25519/README.md | 57 ++++++++++++ crates/el-core/README.md | 80 ++++++++++++++++ crates/el-grammar/README.md | 66 +++++++++++++ crates/el-memory/README.md | 73 +++++++++++++++ crates/el-provenance/README.md | 63 +++++++++++++ crates/el-runtime/README.md | 92 +++++++++++++++++++ crates/el-safety/README.md | 64 +++++++++++++ crates/el-telemetry/README.md | 61 ++++++++++++ 14 files changed, 902 insertions(+), 23 deletions(-) create mode 100644 crates/adapters/el-cloud/README.md create mode 100644 crates/adapters/el-engine-candle/README.md create mode 100644 crates/adapters/el-ffi/README.md create mode 100644 crates/adapters/el-grammar-llguidance/README.md create mode 100644 crates/adapters/el-provenance-ed25519/README.md create mode 100644 crates/el-core/README.md create mode 100644 crates/el-grammar/README.md create mode 100644 crates/el-memory/README.md create mode 100644 crates/el-provenance/README.md 
create mode 100644 crates/el-runtime/README.md create mode 100644 crates/el-safety/README.md create mode 100644 crates/el-telemetry/README.md diff --git a/Cargo.lock b/Cargo.lock index 193534e..385d488 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -793,7 +793,7 @@ checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" [[package]] name = "el-chat" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", "el-engine-candle", @@ -801,7 +801,7 @@ dependencies = [ [[package]] name = "el-cloud" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", "reqwest", @@ -812,11 +812,11 @@ dependencies = [ [[package]] name = "el-core" -version = "0.3.7" +version = "0.3.8" [[package]] name = "el-engine-candle" -version = "0.3.7" +version = "0.3.8" dependencies = [ "candle-core", "candle-transformers", @@ -828,7 +828,7 @@ dependencies = [ [[package]] name = "el-ffi" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-cloud", "el-core", @@ -842,7 +842,7 @@ dependencies = [ [[package]] name = "el-grammar" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", "el-provenance", @@ -851,21 +851,21 @@ dependencies = [ [[package]] name = "el-memory" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", ] [[package]] name = "el-provenance" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", ] [[package]] name = "el-provenance-ed25519" -version = "0.3.7" +version = "0.3.8" dependencies = [ "ed25519-dalek", "el-core", @@ -874,7 +874,7 @@ dependencies = [ [[package]] name = "el-runtime" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", "el-memory", @@ -884,14 +884,14 @@ dependencies = [ [[package]] name = "el-safety" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", ] [[package]] name = "el-telemetry" -version = "0.3.7" +version = "0.3.8" dependencies = [ "el-core", ] diff --git a/Cargo.toml b/Cargo.toml index d6902b4..a3f7f4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 
+35,7 @@ exclude = [ [workspace.package] edition = "2021" -version = "0.3.7" +version = "0.3.8" license = "Apache-2.0" rust-version = "1.96" repository = "Edge-Native LLM SDK" @@ -45,16 +45,16 @@ repository = "Edge-Native LLM SDK" # This is the single source of truth for internal dep version requirements; # member crates reference them with `{ workspace = true }` and add no inline # version fields. `cargo set-version --workspace ` updates every entry. -el-core = { path = "crates/el-core", version = "0.3.7" } -el-memory = { path = "crates/el-memory", version = "0.3.7" } -el-telemetry = { path = "crates/el-telemetry", version = "0.3.7" } -el-provenance = { path = "crates/el-provenance", version = "0.3.7" } -el-safety = { path = "crates/el-safety", version = "0.3.7" } -el-runtime = { path = "crates/el-runtime", version = "0.3.7" } -el-grammar = { path = "crates/el-grammar", version = "0.3.7" } -el-provenance-ed25519 = { path = "crates/adapters/el-provenance-ed25519", version = "0.3.7" } -el-engine-candle = { path = "crates/adapters/el-engine-candle", version = "0.3.7" } -el-cloud = { path = "crates/adapters/el-cloud", version = "0.3.7" } +el-core = { path = "crates/el-core", version = "0.3.8" } +el-memory = { path = "crates/el-memory", version = "0.3.8" } +el-telemetry = { path = "crates/el-telemetry", version = "0.3.8" } +el-provenance = { path = "crates/el-provenance", version = "0.3.8" } +el-safety = { path = "crates/el-safety", version = "0.3.8" } +el-runtime = { path = "crates/el-runtime", version = "0.3.8" } +el-grammar = { path = "crates/el-grammar", version = "0.3.8" } +el-provenance-ed25519 = { path = "crates/adapters/el-provenance-ed25519", version = "0.3.8" } +el-engine-candle = { path = "crates/adapters/el-engine-candle", version = "0.3.8" } +el-cloud = { path = "crates/adapters/el-cloud", version = "0.3.8" } [profile.release] opt-level = 3 diff --git a/crates/adapters/el-cloud/README.md b/crates/adapters/el-cloud/README.md new file mode 100644 index 
0000000..82fd053 --- /dev/null +++ b/crates/adapters/el-cloud/README.md @@ -0,0 +1,83 @@ +# el-cloud — opt-in frontier LLM cloud backend + +The opt-in frontier cloud backend (ADR-010). It implements +`el_core::LlmProvider` over [`reqwest`](https://crates.io/crates/reqwest) using +the OpenAI Chat Completions API — which OpenAI, Anthropic (compat), Gemini +(compat), Ollama, and any OpenAI-compatible endpoint all speak. + +## Air-gap is preserved (ADR-004) + +This is the project's **only** outbound network surface, and it is opt-in at the +API level: + +- Outbound calls happen **only** when an app *explicitly constructs* a + `CloudProvider`. Apps that never build this type have zero outbound surface, + even though the crate compiles as part of the workspace. +- The `reqwest` network dependency is gated to non-wasm32 targets (see the + wasm32 note below), so the web surface has no blocking HTTP transport at all. +- Every consultation emits a content-free `DomainEvent::FrontierLlmConsulted` + (the `provider_hash` is a CRC32 of the provider prefix — never the API key or + any content). + +## What it provides + +- **`CloudProvider`** — owns a pooled `reqwest::blocking::Client` with explicit + timeouts (10 s connect, 60 s idle/per-read, 120 s total per non-streaming + request) so a stalled provider can never block an FFI caller indefinitely. + - `new()` / `Default` + - `with_event_sink(|event| …)` — register a callback for `DomainEvent`s (e.g. + to feed an [`el_telemetry::MetricsCollector`](../../el-telemetry)). + - implements `LlmProvider::chat` (blocking) and `chat_stream` (SSE). 
+ +### Provider routing (by model prefix) + +| Model string | Base URL | +|--------------|----------| +| `openai/` | `https://api.openai.com/v1` | +| `anthropic/` | `https://api.anthropic.com/v1` (compat) | +| `gemini/` | `https://generativelanguage.googleapis.com/v1beta/openai` | +| `ollama/` | `http://localhost:11434/v1` (no key required) | +| `http(s)://…/` | custom base URL | + +The streaming path parses Server-Sent Events strictly: provider error payloads +and malformed chunks fail the call (a half-finished stream is never reported as +a clean completion), and errors carry only parse category/position/size — +**never** echoed content, per the el-core error contract. + +## Usage + +```rust +use el_core::{ChatMessage, ChatRequest, CredentialRef, LlmProvider}; +use el_cloud::CloudProvider; + +let provider = CloudProvider::new(); + +// The credential is resolved at runtime from the platform keystore — never embedded. +let req = ChatRequest::new("openai/gpt-4o", vec![ChatMessage::user("Hello!")]) + .with_credential(CredentialRef::new(api_key_from_keystore)) + .with_max_tokens(256); + +let resp = provider.chat(&req)?; +println!("{}", resp.content); +# Ok::<(), el_core::EdgeError>(()) +``` + +## wasm32 note + +`reqwest::blocking` is unavailable on `wasm32-unknown-unknown` (no threads), and +the synchronous `LlmProvider::chat` cannot await the browser's `fetch`. The +network modules are therefore gated to non-wasm32, and the web binding +(`el-ffi`) exposes a throwing `cloud` constructor instead of silently degrading +(ADR-010 amendment). + +## Status + +Implemented as an explicit egress adapter and a regular workspace member — the +opt-in is enforced at construction, not by excluding it from the build. Uses +`rustls-tls` so Android cross-compiles need no target OpenSSL sysroot. + +--- + +Part of the [Edge Intelligence](../../../README.md) workspace. 
Realizes +[ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md) +while preserving [ADR-004](../../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md). diff --git a/crates/adapters/el-engine-candle/README.md b/crates/adapters/el-engine-candle/README.md new file mode 100644 index 0000000..08a12b9 --- /dev/null +++ b/crates/adapters/el-engine-candle/README.md @@ -0,0 +1,79 @@ +# el-engine-candle — Candle inference engine adapter + +The inference-engine adapter over [Candle](https://github.com/huggingface/candle) +(ADR-002). It implements the runtime's `InferenceEngine` port (`RuntimeAcl`) and +the `LlmProvider` trait (ADR-010), so the same `el_runtime::InferenceSession` +decode loop drives everything — nothing in the SDK pipeline is bypassed. + +Float logits are quantised to integer milli-logits at the anti-corruption-layer +boundary, so Candle's `Tensor`/`Device` types never cross into the domain. No +`unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +This crate ships two engines and two providers, from "seam proof" to +"real on-device chat": + +| Type | What it is | +|------|------------| +| `CandleEngine` | The **engine-seam proof**: one real Candle forward, `embed[last] · w_out` — a single linear projection. `toy()` builds deterministic synthetic weights; `from_path`/`from_bytes` load `token_embd.weight` + `output.weight` from a GGUF. Transformer blocks, attention, RoPE, and norms are *ignored* — logits won't match a real model. | +| `LocalLlmProvider` | Wraps `CandleEngine` behind `LlmProvider` with a byte-level tokenizer. Good for exercising the binding layer end-to-end. | +| `QwenEngine` | A **real Qwen2 transformer** `InferenceEngine` via `candle-transformers`, holding Candle's stateful KV cache. | +| `QwenChatProvider` | A **real local chat backend**: a Qwen2 GGUF + its `tokenizer.json`, rendered to Qwen2.5 ChatML and driven through the standard provenance-gated session. 
This is what powers [`apps/el-chat`](../../../apps/el-chat). | + +### Expected GGUF tensor names (`CandleEngine`) + +- `token_embd.weight` — embedding table `[vocab, dim]` +- `output.weight` or `lm_head.weight` — lm-head `[vocab, dim]` (standard Llama layout) + +Mismatched shapes are rejected at load time, not silently at inference. + +## Usage + +```rust +use el_core::{ChatMessage, ChatRequest, LlmProvider}; +use el_engine_candle::QwenChatProvider; + +// Real on-device chat: a local Qwen2 GGUF + its tokenizer (no network egress). +let provider = QwenChatProvider::from_paths( + "models/qwen2.5-0.5b-instruct-q4_k_m.gguf", + "models/qwen2.5-0.5b-instruct.tokenizer.json", +)?; + +let req = ChatRequest::new("local", vec![ChatMessage::user("What is the capital of France?")]) + .with_max_tokens(128); +let reply = provider.chat(&req)?; +println!("{}", reply.content); +# Ok::<(), el_core::EdgeError>(()) +``` + +Each `chat` call builds a fresh `QwenEngine` (Candle exposes no public KV-cache +reset) and runs the standard SDK path: provenance permit → `load_prompt` +(prefill) → `generate` (grammar mask → safety steer → greedy commit). Decoding +is deterministic greedy argmax, so replies are reproducible. + +## Benchmark instrumentation + +Setting `EL_BENCH=1` makes `QwenChatProvider::chat` print a per-phase breakdown +(model load / tokenize / prefill / decode / detokenize) plus per-forward +attribution (model compute vs. seam quantisation vs. runtime loop) to stderr. +It is zero-cost when unset and is a diagnostic only — not part of public behaviour. + +## Features & dependencies + +- `candle-core` + `candle-transformers` (0.8), and `tokenizers` 0.21 with the + **pure-Rust `fancy-regex`** backend (no C/C++ `onig`/`esaxx`, per ADR-008). +- `metal` feature → enables `candle-core/metal` for Apple GPUs. + +## Status + +Implemented; runs real on-device chat. 
The `CandleEngine` linear projection is +the ADR-002 engine-seam proof; `QwenEngine`/`QwenChatProvider` are the real +transformer path. + +--- + +Part of the [Edge Intelligence](../../../README.md) workspace. Realizes +[ADR-002](../../../docs/adr/ADR-002-candle-as-rust-native-inference-engine.md) +and [ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md); +see the [Inference Runtime](../../../docs/ddd/bounded-contexts/01-inference-runtime.md) context. diff --git a/crates/adapters/el-ffi/README.md b/crates/adapters/el-ffi/README.md new file mode 100644 index 0000000..1172d87 --- /dev/null +++ b/crates/adapters/el-ffi/README.md @@ -0,0 +1,91 @@ +# el-ffi — host bindings (React Native, Flutter, Web) + +One Rust API surface exported three ways, so mobile and web apps can call the +SDK in their native idiom (ADR-001, ADR-009, ADR-010): + +| Surface | Tool | Output | +|---------|------|--------| +| **React Native** | `uniffi-bindgen-react-native` | TypeScript + JSI C++ + Turbo Module | +| **Flutter** | `flutter_rust_bridge` v2 | Dart opaque handle, `Future`/`Stream` | +| **Web / npm** | `wasm-bindgen` | ESM TypeScript package via `wasm-pack` | + +No `unsafe` (`#![forbid(unsafe_code)]`). The crate is `cdylib` + `staticlib` + +`lib` so each toolchain can link the form it needs. + +## What it provides + +- **`EdgeLlm`** — the flat, FFI-friendly facade, annotated for all three + surfaces at once: + - `EdgeLlm::local(model_uri)` — local Candle engine, air-gapped (ADR-002/004). + An empty `model_uri` uses a deterministic toy model for development; + a path loads a consumer-supplied GGUF. + - `EdgeLlm::cloud(model, api_key)` — frontier cloud backend (opt-in, ADR-010). + **Native only** — see the web limitation below. `api_key` must come from the + platform keystore, never embedded. + - `ask(prompt) -> Result` — blocking chat. + - `ask_stream(prompt, |token| …)` — closure streaming (Flutter / FRB v2). 
+ - `ask_stream_cb(prompt, handler)` — `StreamHandler` callback streaming + (React Native; UniFFI cannot export `impl FnMut`). + - `reset()`. +- **`SdkError`** — a thin, FFI-safe projection of `el_core::EdgeError` + (`el-core`'s `Box`/Rust-specific variants are not FFI-safe). Projects to + the host language's exception type, or a JS exception on wasm. +- **`StreamHandler`** — the React Native streaming callback interface. + +## Usage (Rust side) + +```rust +use el_ffi::EdgeLlm; + +// Empty path → deterministic toy model; exercises the full binding layer. +let sdk = EdgeLlm::local(String::new())?; +let reply = sdk.ask("hello".into())?; +assert!(!reply.is_empty()); + +// Streaming (Flutter / closure form): +sdk.ask_stream("hi".into(), |fragment| print!("{fragment}"))?; +# Ok::<(), el_ffi::SdkError>(()) +``` + +## Building the bindings + +The Rust binding *surfaces* compile on the host; the cross-target builds and +codegen run via the [`Makefile`](../../../Makefile): + +```sh +make build-android # cargo build --target aarch64-linux-android (shared lib) +make build-ios # cargo build --target aarch64-apple-ios (static lib) +make build-wasm # wasm-pack build → out/web ESM package + +make codegen-rn # React Native JSI bindings (needs build-android) +make codegen-flutter # flutter_rust_bridge v2 Dart bindings +make bindings # all three surfaces +``` + +Prerequisites (rustup targets, NDK linker, `wasm-pack`, +`uniffi-bindgen-react-native`, `flutter_rust_bridge_codegen`) are documented in +the Makefile header. + +## Web limitations + +On `wasm32` the local path currently uses a dev-stage echo placeholder until +Candle-on-wasm is wired, and the **cloud backend is not available on web** +(ADR-010 amendment): `el-cloud`'s blocking HTTP transport has no wasm +implementation, so `EdgeLlm.cloud` throws an explicit error there instead of +silently degrading. Use a native binding (React Native / Flutter) for cloud +access. 
+ +## Status + +Implemented and tested (native + `wasm32` compile). As a workspace member, the +host-target Rust surfaces build and test with the rest of the workspace +(`cargo test --workspace`); the Android / iOS / wasm cross-builds and binding +codegen run separately via the Makefile because those toolchains are installed +out-of-band. + +--- + +Part of the [Edge Intelligence](../../../README.md) workspace. Realizes +[ADR-001](../../../docs/adr/ADR-001-adopt-webassembly-as-cross-platform-sdk-runtime.md), +[ADR-009](../../../docs/adr/ADR-009-flutter-rust-bridge-for-dart-bindings.md), +and [ADR-010](../../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md). diff --git a/crates/adapters/el-grammar-llguidance/README.md b/crates/adapters/el-grammar-llguidance/README.md new file mode 100644 index 0000000..b280cc6 --- /dev/null +++ b/crates/adapters/el-grammar-llguidance/README.md @@ -0,0 +1,70 @@ +# el-grammar-llguidance — JSON-schema grammar masking over llguidance + +The production grammar masker (ADR-004): real **JSON-schema** token masking over +[llguidance](https://github.com/guidance-ai/llguidance), bridged to a HuggingFace +tokenizer. It is the scale-up of the pure-Rust, regular-grammar +[`el-grammar`](../../el-grammar) — both implement the same +`el_runtime::GrammarMasker` port, so the runtime is agnostic to which is wired. + +The grammar FSM is built once from a JSON schema at construction time and +advanced per committed token during generation (~50 µs/mask for a 128k vocab). +The HuggingFace `tokenizers::Tokenizer` is bridged to llguidance's `TokEnv` via +`toktrie_hf_tokenizers` (the official guidance-ai integration crate). No +`unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +- **`LlguidanceMasker`** — implements `el_runtime::GrammarMasker`: + - `from_tokenizer(&tokenizer, schema_json)` — build a masker from a HF + tokenizer and a JSON-schema string. 
+ - `mask(recent, vocab)` — advances the FSM over newly committed tokens and + returns the allowed-token mask. Once the grammar is satisfied/exhausted (or + a token violates it and the parser cannot recover) it falls back to + allow-all from there on. + - `reset()` — rebuild a fresh parser for the same schema. + +## Usage + +```rust +use el_grammar_llguidance::LlguidanceMasker; +use el_runtime::GrammarMasker; + +let tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json").unwrap(); +let masker = LlguidanceMasker::from_tokenizer(&tokenizer, r#"{"type":"integer"}"#)?; + +// In the decode loop, wired into the session's Ports: +let allow_mask = masker.mask(&committed_tokens, vocab_size); +# Ok::<(), el_core::EdgeError>(()) +``` + +## Building (workspace-excluded) + +This crate depends on `llguidance` + `toktrie_hf_tokenizers` (which require +crates.io) and a native tokenizer build, so it is **excluded from the offline +workspace** and declares its own empty `[workspace]` table to build standalone: + +```sh +cargo build --manifest-path crates/adapters/el-grammar-llguidance/Cargo.toml +cargo test --manifest-path crates/adapters/el-grammar-llguidance/Cargo.toml +``` + +### API-version note + +The call sites target the llguidance / toktrie **1.7** line (llguidance 1.7 + +toktrie_hf_tokenizers 1.7 + tokenizers 0.21 — they must share one `toktrie`). If +a build fails with an API mismatch on `ByteTokenizer`, `ParserFactory`, or +`TokenParser`, the relevant call sites are marked with `// llg-api:` comments. + +> Note: this crate uses the `onig` tokenizer backend (unlike `el-engine-candle`, +> which uses the pure-Rust `fancy-regex` backend per ADR-008). + +## Status + +Implemented and tested (integer-schema masking constrains digit tokens; +allow-all fallback on exhaustion). Lark/CFG grammars are tracked follow-up work. + +--- + +Part of the [Edge Intelligence](../../../README.md) workspace (excluded build). 
+Realizes [ADR-004](../../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md); +see the [Grammar Constraint](../../../docs/ddd/bounded-contexts/04-grammar-constraint.md) context. diff --git a/crates/adapters/el-provenance-ed25519/README.md b/crates/adapters/el-provenance-ed25519/README.md new file mode 100644 index 0000000..489f2d8 --- /dev/null +++ b/crates/adapters/el-provenance-ed25519/README.md @@ -0,0 +1,57 @@ +# el-provenance-ed25519 — real ED25519 signature verifier + +The production `SignatureVerifier` for the model-provenance load gate (ADR-006), +implemented over [`ed25519-dalek`](https://crates.io/crates/ed25519-dalek) v2. + +It plugs into the gate *logic* in [`el-provenance`](../../el-provenance): this +adapter answers "is this signature valid?", and `el-provenance` decides what to +do about it (issue a `LoadPermit`, or hard-stop). Pure-Rust dependency tree, no +`unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +- **`Ed25519Verifier`** — verifies model signatures against a set of trusted + provider public keys, keyed by `public_key_id` (the trust-anchor reference + from ADR-006): + - `new()` / `Default` + - `register(id, key_bytes: [u8; 32])` — register a trusted public key + - implements `el_provenance::SignatureVerifier::verify`, which **rejects** + (returns `false`) on an unknown key id, a malformed signature, or a + verification failure — every failure mode is a hard stop upstream. 
+ +## Usage + +```rust +use el_core::{ModelFormat, ModelId, ModelVersion}; +use el_provenance::ModelArtifact; +use el_provenance_ed25519::Ed25519Verifier; + +let mut verifier = Ed25519Verifier::new(); +verifier.register(/* public_key_id */ 1, trusted_public_key_bytes)?; + +let mut artifact = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf); +artifact.verify(&verifier, model_bytes, signature_bytes, 1); + +// Verified → a LoadPermit; tampered bytes, a forged signature, or an unknown +// key id → a hard error with no fallback. +let permit = artifact.ensure_loadable()?; +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +## Why it's a separate adapter + +The core gate lives in `el-provenance` with **zero external dependencies** so it +cross-compiles everywhere and stays unit-testable offline with doubles. This +adapter isolates the one crates.io dependency (`ed25519-dalek`) behind the +`SignatureVerifier` seam. It is a regular workspace member. + +## Status + +Implemented and tested — genuine signatures verify; tampering, forged +signatures, malformed signatures, and unknown key ids are all rejected. + +--- + +Part of the [Edge Intelligence](../../../README.md) workspace. Realizes +[ADR-006](../../../docs/adr/ADR-006-mandatory-ed25519-model-signature-verification-load-gate.md); +see the [Model Provenance](../../../docs/ddd/bounded-contexts/08-model-provenance.md) context. diff --git a/crates/el-core/README.md b/crates/el-core/README.md new file mode 100644 index 0000000..67cf956 --- /dev/null +++ b/crates/el-core/README.md @@ -0,0 +1,80 @@ +# el-core — shared domain vocabulary + +The foundational crate of the Edge Intelligence SDK: the *ubiquitous language* +of the project turned into Rust types. Every other crate speaks in terms of the +ids, value objects, errors, events, and the provider trait defined here. + +`el-core` has **zero external dependencies** — pure `std`, so it compiles +offline on any target including `wasm32` (ADR-008). 
It contains no I/O, no +network, and no `unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +| Module | Key types | Purpose | +|--------|-----------|---------| +| `ids` | `SessionId`, `ModelId`, `ModelVersion` | Identifier value objects | +| `value_objects` | `Token`, `ModelFormat`, `RuntimeKind`, `DeviceTarget`, `Phase`, `SafetyMode`, `SpeculationMode`, `StopReason` | Core enums and the `Token = u32` alias | +| `config` | `SessionConfig` | Immutable per-session configuration | +| `error` | `EdgeError`, `Result` | The SDK-wide error type | +| `events` | `DomainEvent`, `EventEnvelope`, `DegradeReason` | Content-free domain events | +| `provider` | `LlmProvider`, `ChatRequest`, `ChatResponse`, `ChatMessage`, `ChatRole`, `ChatToken`, `CredentialRef` | The unified backend abstraction | + +All names are re-exported at the crate root, e.g. `use el_core::{LlmProvider, ChatRequest};`. + +## Cross-cutting invariants encoded here + +These are enforced by the type system, not by convention: + +- **Content-free events (ADR-007).** `DomainEvent` and `EventEnvelope` derive + `Copy`. A `String`/`Vec`/heap field is not `Copy`, so adding one fails to + compile — "no prompt or response content on an event" is a *compile-time* + guarantee. Ratios and scores are carried as fixed-point integers (`*_milli`). +- **Air-gap by default (ADR-004).** `SessionConfig::default().hybrid_mode` is + `false`. The only network seam is an explicit opt-in. +- **Unified provider (ADR-010).** `LlmProvider` covers both the local Candle + engine and cloud frontier backends behind one trait, so host apps can swap + local ↔ frontier without touching their UI. +- **Redacted credentials.** `CredentialRef`'s `Debug` output is + `CredentialRef([REDACTED])`, so bearer keys cannot leak into logs or panic + messages. + +## Usage + +```rust +use el_core::{ChatMessage, ChatRequest, LlmProvider, SessionConfig}; + +// Configuration is air-gapped by default (ADR-004). 
+let cfg = SessionConfig::default(); +assert!(!cfg.hybrid_mode); + +// A backend-agnostic request. `model` is a routing hint: +// "local"/"" → local engine, "openai/…", "anthropic/…", "ollama/…", "gemini/…" +let req = ChatRequest::new("local", vec![ChatMessage::user("Hello!")]) + .with_max_tokens(256) + .with_temperature(700); // 700 = 0.7 (milli to keep the type Eq-able) + +// Any backend is reached through the same trait. +fn run(provider: &dyn LlmProvider, req: &ChatRequest) -> el_core::Result<String> { + Ok(provider.chat(req)?.content) +} +``` + +## Place in the workspace + +`el-core` is the root of the dependency graph: `el-memory`, `el-telemetry`, +`el-provenance`, `el-safety`, `el-runtime`, `el-grammar`, and every adapter depend on it, and +it depends on nothing. Keep it dependency-free — that property is what lets the +local core cross-compile to WASM and mobile targets. + +## Status + +Implemented and tested. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. Realizes +[ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md), +[ADR-007](../../docs/adr/ADR-007-content-free-domain-events-privacy-by-construction-telemetry.md), +[ADR-008](../../docs/adr/ADR-008-implement-the-sdk-in-rust-instead-of-c-cpp.md), +and [ADR-010](../../docs/adr/ADR-010-unified-llm-provider-trait-with-opt-in-frontier-egress.md). +The vocabulary mirrors [`docs/ddd/ubiquitous-language.md`](../../docs/ddd/ubiquitous-language.md). diff --git a/crates/el-grammar/README.md b/crates/el-grammar/README.md new file mode 100644 index 0000000..377f2d2 --- /dev/null +++ b/crates/el-grammar/README.md @@ -0,0 +1,66 @@ +# el-grammar — pure-Rust DFA token masking + +Grammar-constrained decoding via **DFA token masking**, in pure Rust (the +Grammar Constraint context). A grammar is compiled to a deterministic automaton +over **token ids** (the alphabet). 
Each decode step, the masker replays the +committed tokens to find the current state and allows only tokens with a valid +transition — the exact token-level masking mechanism that XGrammar/llguidance +implement, here for *regular* grammars with no external dependencies. + +Depends on `el-core` and `el-runtime`. No `unsafe` (`#![forbid(unsafe_code)]`). + +## Scope + +This crate covers **regular grammars over an explicit token alphabet**. Full +context-free / JSON-schema grammars with a real tokenizer environment are the +production scale-up in +[`el-grammar-llguidance`](../adapters/el-grammar-llguidance). Both implement the +same `el_runtime::GrammarMasker` port, so the runtime is agnostic to which one +is wired. + +## What it provides + +- **`Dfa`** — a deterministic finite automaton over `Token` ids, with a builder + API: `Dfa::new(start).transition(from, token, to).accept(state)`. Plus + `step`, `run` (replay a token sequence), and `is_accepting`. +- **`DfaMasker`** — wraps a `Dfa` and implements `el_runtime::GrammarMasker`. + `mask(recent, vocab)` returns a per-token allow mask; a dead (invalid) state + yields an all-`false` mask (nothing legal). `accepts(committed)` reports + whether stopping now would be valid. +- **`StateId`** — `u32` alias for automaton states. + +## Usage + +```rust +use el_grammar::{Dfa, DfaMasker}; +use el_runtime::GrammarMasker; + +// A grammar that accepts exactly the token sequence 5, 5, 9. +let dfa = Dfa::new(0) + .transition(0, 5, 1) + .transition(1, 5, 2) + .transition(2, 9, 3) + .accept(3); +let masker = DfaMasker::new(dfa); + +// At the start, only token 5 is legal. +let mask = masker.mask(&[], 10); +assert!(mask[5] && !mask[9]); + +// Wired into a session's Ports, this forces grammar-valid output even when the +// engine's raw logits would prefer something else. +assert!(masker.accepts(&[5, 5, 9])); +``` + +Drop it into a session via `Ports { grammar: Box::new(masker), ..Ports::permissive() }`. 
+ +## Status + +Implemented and tested, including an end-to-end test that constrains real +decoding inside `el_runtime::InferenceSession`. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace; see the +[Grammar Constraint](../../docs/ddd/bounded-contexts/04-grammar-constraint.md) +context and [ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md). diff --git a/crates/el-memory/README.md b/crates/el-memory/README.md new file mode 100644 index 0000000..d2ea895 --- /dev/null +++ b/crates/el-memory/README.md @@ -0,0 +1,73 @@ +# el-memory — static memory planning + +Ahead-of-time memory planning, a single contiguous arena allocated once, and +descriptor-only KV-cache compaction (ADR-003). The discipline — reproduced from +ExecuTorch's technique, in pure Rust — is to assign every tensor a fixed offset +*before* inference so the decode loop performs **no heap allocation**, and to +resolve KV pruning by shuffling descriptors rather than copying data. + +Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +- **`StaticMemoryPlanner::plan(tensors, budget_bytes)`** — interval-colours + tensor lifetimes per memory tier so that tensors whose lifetimes do **not** + overlap reuse the same offset. Returns a `MemoryPlan`, or + `EdgeError::MemoryBudgetExceeded` when the packed plan does not fit the budget + (the signal the runtime uses to disable optional features or spill). +- **`MemoryPlan`** — the immutable allocation: `placements()`, `placement(id)`, + `sram_bytes()`, `dram_bytes()`, `total_bytes()`. +- **`TensorSpec`** / **`BufferLifetime`** / **`MemoryTier`** (`Sram` | `Dram`) / + **`Placement`** / **`TensorId`** / **`TensorOffset`** — the planner's inputs + and outputs. +- **`Arena`** — a contiguous byte buffer allocated once at session init; the + decode loop borrows planned sub-slices via `region()` / `region_mut()` and + never allocates again. 
+- **`KvRegion`** / **`KvSlot`** — the KV cache addressed through descriptors. + `compact()` removes pruned descriptors and re-indexes survivors **without + moving payload bytes** — survivors keep their original `offset`. + +## Usage + +```rust +use el_memory::{Arena, BufferLifetime, KvRegion, MemoryTier, StaticMemoryPlanner, TensorSpec}; + +// Two same-tier tensors with disjoint lifetimes share one offset, so the tier +// needs max(size), not the sum. +let tensors = [ + TensorSpec { id: 1, size: 100, tier: MemoryTier::Dram, lifetime: BufferLifetime::new(0, 2) }, + TensorSpec { id: 2, size: 100, tier: MemoryTier::Dram, lifetime: BufferLifetime::new(3, 5) }, +]; +let plan = StaticMemoryPlanner::plan(&tensors, 1024 * 1024)?; +assert_eq!(plan.dram_bytes(), 100); // reused, not 200 + +// Allocate the arena once; the decode loop borrows planned regions. +let mut arena = Arena::new(plan.total_bytes() as usize); +if let Some(region) = arena.region_mut(0, 100) { /* fill weights */ } + +// KV compaction shuffles descriptors only — no data copy. +let mut kv = KvRegion::new(); +kv.push(0); // token 0 @ offset 0 +kv.push(64); // token 1 @ offset 64 +kv.mark_pruned(0); +let reclaimed = kv.compact(); // 1; the survivor keeps offset 64 +# Ok::<(), el_core::EdgeError>(()) +``` + +## Place in the workspace + +Used by `el-runtime` (the session owns a `KvRegion`). The planner's +`MemoryBudgetExceeded` error and the `MemoryPlanCreated`/`KvCacheCompacted` +domain events tie this crate to the Telemetry & Privacy context. + +## Status + +Implemented and tested. The `Arena` models the allocate-once + fixed-offset +contract in safe Rust; true OS page-alignment and DMA wiring are a platform +concern handled when a real engine path is attached. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. 
Realizes +[ADR-003](../../docs/adr/ADR-003-static-memory-planning-with-zero-allocation-arena.md); +see the [Memory Management](../../docs/ddd/bounded-contexts/06-memory-management.md) context. diff --git a/crates/el-provenance/README.md b/crates/el-provenance/README.md new file mode 100644 index 0000000..25c8fa8 --- /dev/null +++ b/crates/el-provenance/README.md @@ -0,0 +1,63 @@ +# el-provenance — model-signature load gate + +The hard model-signature load gate (ADR-006). This crate owns the *decision +logic*: a `ModelArtifact` must reach `Verified` before a `LoadPermit` is issued, +and a missing or failing signature is a **hard stop with no fallback**. + +The actual ED25519 maths is abstracted behind the `SignatureVerifier` trait so +the gate is testable offline. The real `ed25519-dalek` implementation lives in +the [`el-provenance-ed25519`](../adapters/el-provenance-ed25519) adapter. + +Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +- **`SignatureVerifier`** — the abstracted primitive: + `verify(bytes, signature, public_key_id) -> bool`. Implemented for real by the + ed25519 adapter and by test doubles. +- **`ModelArtifact`** — a model file plus its provenance metadata + (`id`, `version`, `format`, `status`). `verify(...)` transitions the status + *before* any load/mmap; `ensure_loadable()` is the gate itself. +- **`LoadPermit`** — the capability token proving an artifact passed the gate. + It **cannot be constructed except via `ensure_loadable()`**, so a model that + has not been verified cannot reach the runtime — `el_runtime::InferenceSession` + requires a `LoadPermit` to be built (the Conformist relationship, enforced in + the type system). +- **`VerificationStatus`** — `Unverified` | `Verified` | `Rejected`. + +## Usage + +```rust +use el_core::{ModelFormat, ModelId, ModelVersion}; +use el_provenance::{ModelArtifact, SignatureVerifier}; + +// Plug in a real verifier (el-provenance-ed25519) or a test double. 
+struct AlwaysOk; +impl SignatureVerifier for AlwaysOk { + fn verify(&self, _bytes: &[u8], _sig: &[u8], _key_id: u32) -> bool { true } +} + +let mut artifact = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf); +artifact.verify(&AlwaysOk, b"", b"", /* public_key_id */ 7); + +// No verified signature → no permit → no session. +let permit = artifact.ensure_loadable()?; // Err(UnverifiedModel | SignatureRejected) otherwise +# Ok::<(), el_core::EdgeError>(()) +``` + +## Place in the workspace + +`el-runtime` accepts a `LoadPermit` (not raw bytes) to construct a session, so +this gate sits in front of every inference path. The ed25519 adapter provides +the production `SignatureVerifier`. + +## Status + +Implemented and tested. The gate *logic* is fully covered here; real signature +verification is provided by the ed25519 adapter. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. Realizes +[ADR-006](../../docs/adr/ADR-006-mandatory-ed25519-model-signature-verification-load-gate.md); +see the [Model Provenance](../../docs/ddd/bounded-contexts/08-model-provenance.md) context. diff --git a/crates/el-runtime/README.md b/crates/el-runtime/README.md new file mode 100644 index 0000000..727eb12 --- /dev/null +++ b/crates/el-runtime/README.md @@ -0,0 +1,92 @@ +# el-runtime — session lifecycle & decode-loop orchestration + +The Core of the SDK: the inference session state machine, the port traits that +collaborator contexts plug into, and the decode-loop orchestrator (ADR-001). + +Air-gap is **structural** here (ADR-004): this crate has no network dependency, +and the only outbound seam is the opt-in `HybridRelay` port. No `unsafe` +(`#![forbid(unsafe_code)]`). 
+ +## The decode-step invariant + +Every decode step composes its collaborators in a fixed, invariant order: + +``` +grammar mask → safety adjust → sample (greedy argmax) → commit KV +``` + +Grammar runs *before* safety, so safety steering only ever operates over +already-legal tokens. This ordering is enforced in `InferenceSession::generate` +and covered by tests. + +## What it provides + +- **`InferenceSession`** — the aggregate root. Constructing + it **requires a `LoadPermit`** (`el-provenance`), so an unverified model cannot + reach the runtime. Drives the `Initialized → Prefilling → Decoding → Completed` + phases via `load_prompt()`, `generate()`, `reset()`, and emits content-free + `EventEnvelope`s (`drain_events()`). +- **Port traits** (the collaborator seams): + - `InferenceEngine` — `prefill`, `next_logits` (integer milli-logits), `eos_token`. + - `PromptCompressor` — optional LLMLingua-2-style compression. + - `GrammarMasker` — per-token allow mask. + - `HybridRelay` — opt-in **LAN-only** relay; there is no cloud variant. +- **`Ports`** — the collaborator bundle bound to a session; `Ports::permissive()` + gives identity compression, allow-all grammar, no safety, and no relay. +- **Defaults** — `IdentityCompressor`, `AllowAllMasker`, and `NullEngine` (emits + EOS right after prefill) let you exercise the full lifecycle without any + external adapter. +- **Re-exports** `el_safety::{SafetySteerer, LogitAdjustment}` so callers wire + one type system. + +`consult_relay()` hard-fails with `EdgeError::AirGapViolation` unless +`hybrid_mode` is enabled **and** a relay is wired (ADR-004). + +## Usage + +```rust +use el_core::{SessionConfig, SessionId, StopReason}; +use el_provenance::{ModelArtifact, SignatureVerifier}; +use el_core::{ModelFormat, ModelId, ModelVersion}; +use el_runtime::{InferenceSession, NullEngine, Ports}; + +// A LoadPermit is required to build a session (ADR-006). 
+struct Ok_; impl SignatureVerifier for Ok_ { fn verify(&self, _: &[u8], _: &[u8], _: u32) -> bool { true } } +let mut art = ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), ModelFormat::Gguf); +art.verify(&Ok_, b"w", b"s", 1); +let permit = art.ensure_loadable()?; + +let mut session = InferenceSession::new( + SessionId(1), + SessionConfig::default(), + NullEngine::new(/* eos */ 3, /* vocab */ 8), + permit, +); + +let ports = Ports::permissive(); +session.load_prompt(&ports, &[10, 11, 12])?; // compress → prefill → KV +let stop = session.generate(&ports, 16)?; // runs the decode loop +assert_eq!(stop, StopReason::Eos); +# Ok::<(), el_core::EdgeError>(()) +``` + +A real engine plugs into `InferenceEngine` (see +[`el-engine-candle`](../adapters/el-engine-candle)); a real grammar plugs into +`GrammarMasker` (see [`el-grammar`](../el-grammar) and +[`el-grammar-llguidance`](../adapters/el-grammar-llguidance)). + +## Place in the workspace + +Depends on `el-core`, `el-memory` (owns a `KvRegion`), `el-safety`, and +`el-provenance`. It is the hub every adapter wires into. + +## Status + +Implemented and tested. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. Realizes +[ADR-001](../../docs/adr/ADR-001-adopt-webassembly-as-cross-platform-sdk-runtime.md) +and [ADR-004](../../docs/adr/ADR-004-air-gapped-by-default-with-opt-in-hybrid-mode.md); +the decode order is specified in [`docs/ddd/domain-events.md`](../../docs/ddd/domain-events.md). diff --git a/crates/el-safety/README.md b/crates/el-safety/README.md new file mode 100644 index 0000000..8790b7e --- /dev/null +++ b/crates/el-safety/README.md @@ -0,0 +1,64 @@ +# el-safety — on-device decoder-time safety + +On-device, tiered, decoder-time safety (ADR-005). Safety is applied as a +per-step logit adjustment during decoding — **after** the grammar mask and +**before** sampling — so it only ever steers over already-legal tokens. 
**No +safety path touches the network.** + +Depends only on `el-core`. No `unsafe` (`#![forbid(unsafe_code)]`). + +## What it provides + +- **`SafetySteerer`** — the per-step intervention trait: `adjust(recent_tokens) + -> LogitAdjustment` and `mode()`. The runtime calls this each decode step. +- **`LogitAdjustment`** — a sparse, integer (milli-logit) vector applied to the + target logits. Sparse + integer keeps steering deterministic and + allocation-light. `delta_for(token)`, `l1_norm_milli()` (what the + `LogitsSteered` telemetry event reports), `is_empty()`. +- **`SafetyModeSelector::resolve(requested, device)`** — budget-gates the tier + by device profile: `SecDecoding` (two ~1B models) is downgraded to + `Lightweight` on a `MidRange` device. +- **Steerers per `SafetyMode`:** + - `NoSafety` (`Off`) — a no-op. + - `LightweightFilter` (`Lightweight`) — a training-free blacklist filter + (**fully implemented**). Banned tokens receive `HARD_BAN = -1_000_000` + milli-logits so they cannot be sampled. + - `SecDecodingSteerer` (`SecDecoding`) — base-vs-safety-model steering. + **Scaffolded** follow-up: it requires two ~1B models on Candle, so until the + assets are wired it returns no adjustment while honestly reporting its mode + (so callers can select it without it silently mis-steering). + +## Usage + +```rust +use el_core::{DeviceTarget, SafetyMode}; +use el_safety::{LightweightFilter, SafetyModeSelector, SafetySteerer}; + +// On a mid-range device, SecDecoding is downgraded to a tier it can afford. +let mode = SafetyModeSelector::resolve(SafetyMode::SecDecoding, DeviceTarget::MidRange); +assert_eq!(mode, SafetyMode::Lightweight); + +// Lightweight bans specific token ids outright. 
+let filter = LightweightFilter::new(vec![42, 99]); +let adj = filter.adjust(&[]); +assert_eq!(adj.delta_for(42), LightweightFilter::HARD_BAN); +assert_eq!(adj.delta_for(7), 0); +``` + +## Place in the workspace + +Re-exported by `el-runtime` (`el_runtime::{SafetySteerer, LogitAdjustment}`) so +callers wire a single type system. The session applies the chosen steerer in the +invariant decode order `grammar mask → safety adjust → sample → commit`. + +## Status + +Partial by design: the `Lightweight` blacklist path is real and tested; +`SecDecoding`/`Csd` model-backed steering is a tracked follow-up that needs +model assets. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. Realizes +[ADR-005](../../docs/adr/ADR-005-on-device-only-tiered-decoder-time-safety.md); +see the [Safety](../../docs/ddd/bounded-contexts/05-safety.md) context. diff --git a/crates/el-telemetry/README.md b/crates/el-telemetry/README.md new file mode 100644 index 0000000..e8b7723 --- /dev/null +++ b/crates/el-telemetry/README.md @@ -0,0 +1,61 @@ +# el-telemetry — content-free metrics collector + +A one-way, downstream subscriber that folds content-free +[`el_core::DomainEvent`](../el-core)s into performance snapshots (ADR-007). + +It depends on `el-core` and **nothing depends on it**, and it has no network +channel of its own. Because it can only ever read the numeric and enum fields of +events that are *already* content-free by construction, "no user content in +telemetry" is structural — there is no code path by which a prompt or response +could reach a metric. + +## What it provides + +- **`MetricsCollector`** — subscribes to the domain-event stream and maintains a + running snapshot. Call `observe(&envelope)` per event and `snapshot()` to read. +- **`TelemetrySnapshot`** — a `Copy` struct of counters and gauges: + `prefill_tps`, `decode_tps`, `ttft_ms`, `peak_bytes`, `tokens_generated`, + `compressions`, `safety_violations`. 
+ +`peak_bytes` is a monotonic high-water mark — it only ever rises across the +events it observes. + +## Usage + +```rust +use el_core::{DomainEvent, EventEnvelope, SessionId}; +use el_telemetry::MetricsCollector; + +let mut collector = MetricsCollector::new(); + +collector.observe(&EventEnvelope::new( + SessionId(1), + 0, + DomainEvent::PrefillCompleted { prompt_tokens: 100, kv_len: 100, prefill_tps: 480 }, +)); +collector.observe(&EventEnvelope::new( + SessionId(1), + 1, + DomainEvent::TokenCommitted { kv_len: 101 }, +)); + +let snap = collector.snapshot(); +assert_eq!(snap.prefill_tps, 480); +assert_eq!(snap.tokens_generated, 1); +``` + +## Place in the workspace + +In a full build, the runtime drains `EventEnvelope`s from an `InferenceSession` +(or a cloud `CloudProvider`'s event sink) and feeds them here. This crate is the +read-only end of that pipeline. + +## Status + +Implemented and tested. + +--- + +Part of the [Edge Intelligence](../../README.md) workspace. Realizes +[ADR-007](../../docs/adr/ADR-007-content-free-domain-events-privacy-by-construction-telemetry.md); +see the [Telemetry & Privacy](../../docs/ddd/bounded-contexts/09-telemetry-privacy.md) context. From a23bcca858a8bfc00f1b9e337227945445fac534 Mon Sep 17 00:00:00 2001 From: Tovli Date: Mon, 15 Jun 2026 17:29:36 +0300 Subject: [PATCH 2/2] docs(readme): add architecture/entry-points section and collapsible layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add an "Architecture and entry points" section: a three-layer table (el-ffi/EdgeLlm device facade → el-core/LlmProvider seam → el-runtime orchestrator) and a layered diagram, making the SDK entry point explicit. - Restructure long sections into collapsible
blocks with a Contents TOC for readability (Why, decode pipeline, quick-start extras, chat client, workspace map, ADRs, domain model, roadmap). - Fix the stale "workspace-excluded" label on the el-ffi row — el-ffi and el-cloud are regular workspace members; only el-grammar-llguidance is excluded. Link each crate to its new per-crate README. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 182 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 149 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index d19b01f..d3587c0 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,29 @@ privacy-preserving telemetry, and host bindings for mobile and web runtimes. Local inference is the default path. Frontier and OpenAI-compatible providers exist only behind an explicit opt-in backend. +> **New here?** Jump to [Architecture and entry points](#architecture-and-entry-points) +> to see where the SDK starts and how the crates fit together, then +> [Quick start](#quick-start) to build it. + +## Contents + +- [Why Edge Intelligence?](#why-edge-intelligence) +- [Architecture and entry points](#architecture-and-entry-points) +- [Quick start](#quick-start) +- [Local chat test client](#local-chat-test-client) +- [Workspace map](#workspace-map) +- [Architecture decisions](#architecture-decisions) +- [Domain model](#domain-model) +- [Roadmap](#roadmap) +- [Documentation](#documentation) + ## Why Edge Intelligence? Most mobile LLM stacks start in the cloud and add local features later. This -project starts at the edge: +project starts at the edge. + +
<details> +<summary>The six principles that shape every crate</summary> | Principle | What it means in the SDK | |-----------|--------------------------| @@ -35,7 +54,61 @@ project starts at the edge: | **Provable provenance** | Model signatures are verified before a session can be constructed. | | **Opt-in egress** | Cloud/frontier providers are a separate adapter and must be wired deliberately by the host app. | -## What It Does +</details>
+ +## Architecture and entry points + +Edge Intelligence is a **hexagonal (ports-and-adapters) workspace, not a single +monolithic crate**. There is no `edge-intelligence` umbrella crate; coherence +comes from three layers, each with a clear entry point: + +| Layer | Crate · symbol | Where it fits | +|-------|----------------|---------------| +| **Device SDK facade** | [`el-ffi`](crates/adapters/el-ffi) · `EdgeLlm` | The composition root shipped to devices. Wires the local engine and opt-in cloud behind one flat API and projects it to React Native (UniFFI/JSI), Flutter (FRB), and Web (wasm-bindgen). **Start here to build an app.** | +| **Rust API seam** | [`el-core`](crates/el-core) · `LlmProvider` | The single trait every backend implements and every Rust consumer calls (ADR-010). **Start here to embed the SDK in Rust.** | +| **Orchestrator** | [`el-runtime`](crates/el-runtime) · `InferenceSession` | Composes provenance, memory, safety, and grammar into the decode loop — the engine the providers drive. | + +```text + Edge device app · Kotlin · Swift · Dart · TypeScript + │ + ▼ + el-ffi · EdgeLlm ← device SDK entry point (composition root) + │ + ▼ + el-core · LlmProvider (trait) ← unified Rust API seam (ADR-010) + │ + ┌────┴───────────────┐ + ▼ ▼ + el-engine-candle el-cloud ← backends: local (default) · opt-in frontier + │ + ▼ + el-runtime · InferenceSession ← orchestrator + │ composes + ▼ + el-memory · el-provenance · el-safety · el-grammar +``` + +
<details> +<summary>Which entry point should I use?</summary> + +- **Building a mobile or web app** → use [`el-ffi`](crates/adapters/el-ffi)'s + `EdgeLlm`. Construct `EdgeLlm::local(model_uri)` (air-gapped) or + `EdgeLlm::cloud(model, api_key)` (opt-in), then call `ask(...)` / + `ask_stream(...)`. The crate compiles to a native library and a wasm package + and ships generated TypeScript/Dart bindings. +- **Embedding the SDK in Rust** → construct a concrete provider and talk to it + through [`el_core::LlmProvider`](crates/el-core): `el_engine_candle::QwenChatProvider` + for on-device chat, or `el_cloud::CloudProvider` for a frontier backend. + [`apps/el-chat`](apps/el-chat) is a worked example. +- **Extending the SDK** (new engine, grammar, safety, or compression) → + implement the matching port trait from [`el-runtime`](crates/el-runtime) + (`InferenceEngine`, `GrammarMasker`, `SafetySteerer`, `PromptCompressor`), + or implement `LlmProvider` directly for a whole new backend. + +</details>
+ +
<details> +<summary>The per-token decode pipeline and SDK seams</summary> ```text Host app @@ -56,7 +129,7 @@ load gate -> memory plan -> prefill -> decode loop content-free events and metrics ``` -The current workspace proves the main seams of the SDK: +The workspace proves the main seams of the SDK: - **Runtime orchestration:** `el-runtime` owns the session state machine and enforces the decode order: grammar mask, safety adjustment, sampling, commit. @@ -70,12 +143,15 @@ The current workspace proves the main seams of the SDK: - **Safety:** `el-safety` provides the tiered policy model and lightweight blacklist steering path, with SecDecoding-style model-backed safety tracked as follow-up work. -- **Inference engine seam:** `el-engine-candle` runs a real Candle CPU forward - on a toy in-code model and drives the runtime loop end to end. +- **Inference engine seam:** `el-engine-candle` runs a real Candle CPU forward — + a single-projection seam proof plus a real Qwen2 transformer — and drives the + runtime loop end to end. - **Provider seam:** `el-core::LlmProvider` gives local and frontier backends one host-facing API; `el-cloud` implements the opt-in OpenAI-compatible path. -## Quick Start +</details>
+ +## Quick start Prerequisite: Rust 1.96 or newer, matching the workspace `rust-version`. @@ -84,13 +160,16 @@ cargo build --workspace cargo test --workspace ``` -Build just the dependency-light local core: +
<details> +<summary>Build just the dependency-light core, or cross-compile to WASM</summary> + +Build and test only the pure-Rust local core (no Candle, no network): ```sh cargo test -p el-core -p el-memory -p el-telemetry -p el-provenance -p el-safety -p el-runtime -p el-grammar ``` -Cross-compile the pure Rust core for WASM: +Cross-compile that core for WASM: ```sh rustup target add wasm32-wasip1 wasm32-unknown-unknown @@ -98,16 +177,26 @@ rustup target add wasm32-wasip1 wasm32-unknown-unknown cargo build --target wasm32-wasip1 -p el-core -p el-memory -p el-telemetry -p el-provenance -p el-safety -p el-runtime -p el-grammar ``` -## Local Chat Test Client +Cross-compile the device bindings (`el-ffi`) for Android / iOS / Web via the +[`Makefile`](Makefile): `make build-android`, `make build-ios`, `make build-wasm`, +or `make bindings` for all three codegen surfaces. + +</details>
+ +## Local chat test client [`apps/el-chat`](apps/el-chat) is an interactive REPL that holds a real -multi-turn conversation with a small LLM running **entirely on-device**. Its -purpose is to exercise the SDK end-to-end, so its only direct dependencies are -SDK crates (`el-core`, `el-engine-candle`) — it contains no inference, model, or -tokenizer code of its own. Every reply flows through the ADR-010 -`LlmProvider` seam: +multi-turn conversation with a small LLM running **entirely on-device**. It +exists to exercise the SDK end-to-end, so its only direct dependencies are SDK +crates (`el-core`, `el-engine-candle`) — it contains no inference, model, or +tokenizer code of its own. -``` +
<details> +<summary>Fetch a model and run it</summary> + +Every reply flows through the ADR-010 `LlmProvider` seam: + +```text el-chat → el_core::LlmProvider → el_engine_candle::QwenChatProvider (real Qwen2 forward via candle-transformers) → el_runtime::InferenceSession @@ -135,17 +224,23 @@ cargo run -p el-chat -- --system "Be terse." --max-tokens 128 ``` REPL commands: `/reset`, `/system `, `/help`, `/exit`. Other flags: -`--model`, `--tokenizer`, `--system`, `--max-tokens`. The `models/` -directory is git-ignored. Decoding is deterministic (the SDK runtime decodes -greedily), so the same prompt yields the same reply. +`--model`, `--tokenizer`, `--system`, `--max-tokens`. The `models/` directory is +git-ignored. See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full +user guide. + +</details>
+ +## Workspace map -See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide. +Twelve crates plus the chat app. Each crate has its own README (linked below) +covering its public API, a usage example, and the ADRs it realizes. -## Workspace Map +
<details> +<summary>Show the full crate table</summary> | Crate | Role | Current state | |-------|------|---------------| -| [`crates/el-core`](crates/el-core) | Shared types, IDs, errors, events, provider trait | Implemented and tested | +| [`crates/el-core`](crates/el-core) | Shared types, IDs, errors, events, `LlmProvider` trait | Implemented and tested | | [`crates/el-memory`](crates/el-memory) | Static arena planning and KV-cache descriptors | Implemented and tested | | [`crates/el-telemetry`](crates/el-telemetry) | Content-free event handling and privacy metrics | Implemented and tested | | [`crates/el-provenance`](crates/el-provenance) | Verified model load permits | Implemented and tested | @@ -154,16 +249,26 @@ See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide. | [`crates/el-grammar`](crates/el-grammar) | DFA grammar masking | Implemented and tested | | [`crates/adapters/el-provenance-ed25519`](crates/adapters/el-provenance-ed25519) | Real ED25519 signature verification | Implemented and tested | | [`crates/adapters/el-engine-candle`](crates/adapters/el-engine-candle) | Candle inference adapter: engine-seam proof plus a real Qwen2 transformer engine and chat provider | Implemented; real on-device chat | -| [`crates/adapters/el-cloud`](crates/adapters/el-cloud) | Opt-in OpenAI-compatible provider backend | Implemented as an explicit egress adapter | +| [`crates/adapters/el-cloud`](crates/adapters/el-cloud) | Opt-in OpenAI-compatible provider backend | Implemented; egress opt-in at construction | +| [`crates/adapters/el-ffi`](crates/adapters/el-ffi) | **Device SDK facade (`EdgeLlm`):** Flutter / UniFFI / wasm-bindgen binding surfaces | Implemented and tested; host build is a workspace member, cross-target builds via `make` | | [`crates/adapters/el-grammar-llguidance`](crates/adapters/el-grammar-llguidance) | llguidance JSON-schema token masking | Implemented and tested; workspace-excluded (crates.io deps) | -| 
[`crates/adapters/el-ffi`](crates/adapters/el-ffi) | Flutter/UniFFI/wasm-bindgen binding surfaces | Implemented and tested (native + wasm32 compile); workspace-excluded (cross toolchains) | | [`apps/el-chat`](apps/el-chat) | Interactive chat test client; SDK-only deps, drives the runtime end-to-end | Implemented; runs real on-device chat | -## Architecture Decisions +Of the adapters, only `el-grammar-llguidance` is excluded from the default +workspace build (it pulls crates.io-only grammar dependencies); `el-cloud` and +`el-ffi` are regular members whose host targets build and test with +`cargo test --workspace`. + +</details>
+ +## Architecture decisions The project is intentionally decision-heavy because mobile LLM runtimes are easy to overfit to one device, model, or provider. The core choices are recorded as -ADRs: +ADRs. + +
<details> +<summary>The ten architecture decision records</summary> | ADR | Decision | |-----|----------| @@ -180,10 +285,17 @@ ADRs: See the full index in [`docs/adr/README.md`](docs/adr/README.md). -## Domain Model +</details>
+ +## Domain model -The DDD model lives in [`docs/ddd`](docs/ddd/README.md). It breaks the SDK into -nine bounded contexts: +The DDD model lives in [`docs/ddd`](docs/ddd/README.md). The key invariant +across its contexts: **air-gap is the default runtime shape, not a feature flag +sprinkled through the code** — any outbound behavior must be modeled as an +explicit port or adapter. + +
<details> +<summary>The nine bounded contexts</summary> 1. Inference Runtime 2. Prompt Compression @@ -195,14 +307,15 @@ nine bounded contexts: 8. Model Provenance and Security 9. Telemetry and Privacy -The key invariant across those contexts: **air-gap is the default runtime shape, -not a feature flag sprinkled through the code.** Any outbound behavior must be -modeled as an explicit port or adapter. +</details>
-## What Is Next +## Roadmap The prototype has proven the architectural seams. The next engineering work is -to replace toy proofs with production-grade runtime pieces: +to replace toy proofs with production-grade runtime pieces. + +
<details> +<summary>What's next</summary> - Production GGUF/safetensors loading and transformer execution in `el-engine-candle`. @@ -214,11 +327,14 @@ to replace toy proofs with production-grade runtime pieces: - On-device benchmarks for time-to-first-token, decode throughput, memory high-water marks, and thermal behavior. +</details>
+ ## Documentation - Product and technical rationale: [`docs/prd.md`](docs/prd.md) - Domain model: [`docs/ddd/README.md`](docs/ddd/README.md) - Architecture decisions: [`docs/adr/README.md`](docs/adr/README.md) +- Per-crate guides: see the [Workspace map](#workspace-map) — every crate links to its own README. Edge Intelligence is still early, but the direction is deliberate: a small, auditable, Rust-native SDK that lets app developers choose local inference first