Tovli · Tovli · Jun 15, 2026 · Jun 15, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,6 +27,9 @@ members = [
     # Interactive local-LLM chat test client (app, not core) — real Qwen2
     # forward over candle-transformers, drives the ADR-010 LlmProvider seam.
     "apps/el-chat",
+    # Clinical-quality + safety benchmark harness (app, not core) — replays
+    # mental-health benchmark tasks through the same LlmProvider seam.
+    "apps/el-bench",
 ]
 exclude = [
     # Excluded: needs crates.io (llguidance/toktrie) + native tokenizer build deps.

diff --git a/README.md b/README.md
@@ -141,6 +141,44 @@ greedily), so the same prompt yields the same reply.
 
 See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide.
 
+## Benchmarks
+
+The SDK ships two reproducible benchmark harnesses. Both run inference through the
+public `LlmProvider` seam, so they characterize the **SDK's own behavior** and are
+**model-agnostic** — point them at whichever signed model your product loads. The
+model is pluggable and not part of this repo, so no per-model results are published
+here; run the harnesses against your own model to produce them.
+
+**1. Runtime performance / overhead** —
+[`docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md`](docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md)
+
+A phase-level latency breakdown of the local decode path, behind opt-in,
+zero-cost-when-unset instrumentation (`EL_BENCH=1`). The SDK-side conclusion: the
+runtime's own per-token work — decode loop, grammar mask, full-vocab argmax, KV
+commit, and content-free event emission — is **under ~1.2% of decode time**.
+Latency is dominated by model compute plus three orchestration costs the SDK can
+remove, independent of model choice:
+
+- **Batch the prefill** — feed the prompt as one `(1, prompt_len)` forward instead
+  of one forward per prompt token.
+- **Load weights once; reset only the KV cache** — and reuse KV across turns, so a
+  growing conversation is not re-prefilled from scratch each turn.
+- **Stream tokens for real** — emit from inside the decode loop so
+  time-to-first-token is `load + prefill + 1 token`, not full-generation time.
+
+**2. Clinical-quality & safety evaluation** — [`apps/el-bench`](apps/el-bench) ·
+[`benchmarks/README.md`](benchmarks/README.md)
+
+`el-bench` is an SDK-only test client (a sibling to `el-chat`) that replays
+published mental-health benchmarks — CounselBench, MindEval, and the VERA-MH
+suicide-risk safety suite — through the runtime and records transcripts for
+scoring against each benchmark's rubric. Datasets and transcripts are fetched or
+produced locally and are git-ignored (third-party data); only the harness and the
+methodology are committed. Decoding is deterministic, so a given model + task set
+yields identical transcripts — it is designed to run as a **CI safety gate**, so a
+change to the model, the system prompt, or the ADR-005 safety tier can be
+regression-tested against a fixed rubric.
+
 ## Workspace Map
 
 | Crate | Role | Current state |
@@ -158,6 +196,7 @@ See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide.
 | [`crates/adapters/el-grammar-llguidance`](crates/adapters/el-grammar-llguidance) | llguidance JSON-schema token masking | Implemented and tested; workspace-excluded (crates.io deps) |
 | [`crates/adapters/el-ffi`](crates/adapters/el-ffi) | Flutter/UniFFI/wasm-bindgen binding surfaces | Implemented and tested (native + wasm32 compile); workspace-excluded (cross toolchains) |
 | [`apps/el-chat`](apps/el-chat) | Interactive chat test client; SDK-only deps, drives the runtime end-to-end | Implemented; runs real on-device chat |
+| [`apps/el-bench`](apps/el-bench) | Benchmark harness; SDK-only deps, replays quality/safety task sets through the runtime | Implemented; model-agnostic, reproducible |
 
 ## Architecture Decisions
 
@@ -211,8 +250,9 @@ to replace toy proofs with production-grade runtime pieces:
 - Binding codegen and packaging — FRB Dart codegen, uniffi-bindgen-react-native,
   wasm-pack npm publishing (the Rust binding surfaces exist in `el-ffi`).
 - Mobile toolchain validation for Android and iOS `aarch64` targets.
-- On-device benchmarks for time-to-first-token, decode throughput, memory
-  high-water marks, and thermal behavior.
+- On-device benchmarks for memory high-water marks and thermal behavior, and
+  wiring the `el-bench` VERA-MH safety suite into CI as a release gate (latency
+  and quality/safety harnesses already exist — see [Benchmarks](#benchmarks)).
 
 ## Documentation
 

diff --git a/apps/el-bench/Cargo.toml b/apps/el-bench/Cargo.toml
@@ -0,0 +1,29 @@
+# el-bench — clinical-quality + safety benchmark harness for the on-device model.
+#
+# Like el-chat, this is a TEST CLIENT for the SDK: it drives the real on-device
+# inference path (`el_engine_candle::QwenChatProvider` → `el_core::LlmProvider`
+# → `el_runtime::InferenceSession`) and contains no model/tokenizer/inference
+# code of its own. It replays normalized benchmark tasks (CounselBench, MindEval,
+# VERA-MH) through the model and records transcripts for offline judging.
+#
+# serde and serde_json are the only non-SDK dependencies: they parse the task
+# files and write the transcript artifacts. They perform no inference and touch no network.
+[package]
+name = "el-bench"
+description = "Clinical-quality + safety benchmark harness — replays mental-health benchmark tasks through the SDK and records transcripts."
+version.workspace = true  # inherited from the workspace root's [workspace.package]
+edition.workspace = true  # inherited from the workspace root's [workspace.package]
+license.workspace = true  # inherited from the workspace root's [workspace.package]
+
+[[bin]]
+name = "el-bench"  # binary target shares the package name
+path = "src/main.rs"
+
+[dependencies]
+el-core = { workspace = true }  # SDK crate; version/source resolved from [workspace.dependencies]
+el-engine-candle = { workspace = true }  # SDK crate; NOTE(review): header names el_runtime, which is not a direct dep — presumably reached via this crate, confirm
+serde = { version = "1", features = ["derive"] }  # "derive" enables #[derive(Serialize, Deserialize)]
+serde_json = "1"  # parses the task files and writes the transcript artifacts
+
+[lints.rust]
+unsafe_code = "forbid"  # "forbid" is stricter than "deny": an inner #[allow] cannot override it