From 01a97ffbe14e4bc78633cbab2f26f8abcc92b263 Mon Sep 17 00:00:00 2001 From: Tovli Date: Sun, 14 Jun 2026 17:58:16 +0300 Subject: [PATCH 1/3] add test client --- .gitignore | 2 + Cargo.lock | 378 +++++++++++++++++++- Cargo.toml | 10 + README.md | 46 ++- apps/el-chat/Cargo.toml | 24 ++ apps/el-chat/README.md | 146 ++++++++ apps/el-chat/src/main.rs | 232 ++++++++++++ crates/adapters/el-engine-candle/Cargo.toml | 5 + crates/adapters/el-engine-candle/src/lib.rs | 290 +++++++++++++++ 9 files changed, 1129 insertions(+), 4 deletions(-) create mode 100644 apps/el-chat/Cargo.toml create mode 100644 apps/el-chat/README.md create mode 100644 apps/el-chat/src/main.rs diff --git a/.gitignore b/.gitignore index 81f46aa..820833f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ ruvector.db .claude-flow/ target/ out/ +# Local model assets for the el-chat test client (downloaded, not committed). +models/ crates/adapters/el-ffi/src/frb_generated.rs # Claude Code local state (worktrees, settings.local.json) .claude/ diff --git a/Cargo.lock b/Cargo.lock index dd6cbba..253bf95 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,20 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -156,6 +170,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" @@ -186,6 +206,36 @@ dependencies = 
[ "serde", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec 0.6.3", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec 0.8.0", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -303,6 +353,40 @@ dependencies = [ "tracing", ] +[[package]] +name = "candle-nn" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1160c3b63f47d40d91110a3e1e1e566ae38edddbbf492a60b40ffc3bc1ff38" +dependencies = [ + "candle-core", + "half", + "num-traits", + "rayon", + "safetensors", + "serde", + "thiserror 1.0.69", +] + +[[package]] +name = "candle-transformers" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94a0900d49f8605e0e7e6693a1f560e6271279de98e5fa369e7abf3aac245020" +dependencies = [ + "byteorder", + "candle-core", + "candle-nn", + "fancy-regex 0.13.0", + "num-traits", + "rand", + "rayon", + "serde", + "serde_json", + "serde_plain", + "tracing", +] + [[package]] name = "cargo-platform" version = "0.1.9" @@ -326,6 +410,15 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.63" @@ -348,6 +441,21 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "console_error_panic_hook" version = "0.1.7" @@ -477,6 +585,41 @@ dependencies = [ "syn", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "dart-sys" version = "4.1.5" @@ -486,6 +629,15 @@ dependencies = [ "cc", ] +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -531,6 +683,37 @@ dependencies = [ "syn", ] 
+[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + [[package]] name = "digest" version = "0.10.7" @@ -608,6 +791,14 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" +[[package]] +name = "el-chat" +version = "0.1.0" +dependencies = [ + "el-core", + "el-engine-candle", +] + [[package]] name = "el-cloud" version = "0.1.0" @@ -628,9 +819,11 @@ name = "el-engine-candle" version = "0.1.0" dependencies = [ "candle-core", + "candle-transformers", "el-core", "el-provenance", "el-runtime", + "tokenizers", ] [[package]] @@ -731,6 +924,34 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" + +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set 0.5.3", + 
"regex-automata", + "regex-syntax", +] + +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set 0.8.0", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fiat-crypto" version = "0.2.9" @@ -785,6 +1006,12 @@ dependencies = [ "syn", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foreign-types" version = "0.5.0" @@ -1341,7 +1568,7 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-util", @@ -1440,6 +1667,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1477,6 +1710,15 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1549,6 +1791,22 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "malloc_buf" version = "0.0.6" @@ -1656,6 +1914,28 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "nom" version = "7.1.3" @@ -2079,6 +2359,17 @@ dependencies = [ "rayon-core", ] +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools", + "rayon", +] + [[package]] name = "rayon-core" version = "1.13.0" @@ -2139,7 +2430,7 @@ version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -2359,6 +2650,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_plain" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ce1fc6db65a611022b23a0dec6975d63fb80a302cb3388835ff02c097258d50" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2441,6 +2741,18 @@ dependencies = [ "der", ] +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -2453,6 +2765,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" @@ -2601,6 +2919,39 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476" +dependencies = [ + "ahash", + "aho-corasick", + "compact_str", + "dary_heap", + "derive_builder", + "esaxx-rs", + "fancy-regex 0.14.0", + "getrandom 0.3.4", + "itertools", + "log", + "macro_rules_attribute", + "monostate", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 2.0.18", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.52.3" @@ -2811,6 +3162,27 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "uniffi" version = "0.28.3" @@ -3091,7 +3463,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index fe7f108..d195538 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,9 @@ members = [ # Increment 9: FFI binding surfaces — pure-Rust deps compile on host; # cross-target builds run via `make build-android / build-ios / build-wasm`. "crates/adapters/el-ffi", + # Interactive local-LLM chat test client (app, not core) — real Qwen2 + # forward over candle-transformers, drives the ADR-010 LlmProvider seam. + "apps/el-chat", ] exclude = [ # Excluded: needs crates.io (llguidance/toktrie) + native tokenizer build deps. @@ -58,3 +61,10 @@ opt-level = 3 lto = true codegen-units = 1 panic = "abort" + +# Optimize *dependencies* (candle math kernels, tokenizers) at opt-level 3 even +# in dev builds, so the el-chat test client runs CPU inference at a usable speed +# without paying the slow LTO release build. The app crates themselves stay at +# the dev default for fast iterative recompiles. 
+[profile.dev.package."*"] +opt-level = 3 diff --git a/README.md b/README.md index 1689bfb..d19b01f 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,49 @@ rustup target add wasm32-wasip1 wasm32-unknown-unknown cargo build --target wasm32-wasip1 -p el-core -p el-memory -p el-telemetry -p el-provenance -p el-safety -p el-runtime -p el-grammar ``` +## Local Chat Test Client + +[`apps/el-chat`](apps/el-chat) is an interactive REPL that holds a real +multi-turn conversation with a small LLM running **entirely on-device**. Its +purpose is to exercise the SDK end-to-end, so its only direct dependencies are +SDK crates (`el-core`, `el-engine-candle`) — it contains no inference, model, or +tokenizer code of its own. Every reply flows through the ADR-010 +`LlmProvider` seam: + +``` +el-chat → el_core::LlmProvider → el_engine_candle::QwenChatProvider + (real Qwen2 forward via candle-transformers) + → el_runtime::InferenceSession + (provenance gate → prefill → decode loop) +``` + +Decoding is the runtime's deterministic greedy argmax, so replies are +reproducible. The model is supplied as a local file — there is no runtime +network egress (ADR-004 air-gap by default). Fetch a small instruct model once: + +```sh +mkdir -p models +curl -sSL -o models/qwen2.5-0.5b-instruct-q4_k_m.gguf \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf +curl -sSL -o models/qwen2.5-0.5b-instruct.tokenizer.json \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json +``` + +Then chat (the defaults point at the files above): + +```sh +cargo run -p el-chat # interactive REPL +cargo run -p el-chat -- --prompt "Hello!" --once # one-shot +cargo run -p el-chat -- --system "Be terse." --max-tokens 128 +``` + +REPL commands: `/reset`, `/system `, `/help`, `/exit`. Other flags: +`--model`, `--tokenizer`, `--system`, `--max-tokens`. The `models/` +directory is git-ignored. 
Decoding is deterministic (the SDK runtime decodes +greedily), so the same prompt yields the same reply. + +See [`apps/el-chat/README.md`](apps/el-chat/README.md) for the full user guide. + ## Workspace Map | Crate | Role | Current state | @@ -110,10 +153,11 @@ cargo build --target wasm32-wasip1 -p el-core -p el-memory -p el-telemetry -p el | [`crates/el-runtime`](crates/el-runtime) | Session lifecycle and decode-loop orchestration | Implemented and tested | | [`crates/el-grammar`](crates/el-grammar) | DFA grammar masking | Implemented and tested | | [`crates/adapters/el-provenance-ed25519`](crates/adapters/el-provenance-ed25519) | Real ED25519 signature verification | Implemented and tested | -| [`crates/adapters/el-engine-candle`](crates/adapters/el-engine-candle) | Candle inference adapter | Host CPU proof implemented | +| [`crates/adapters/el-engine-candle`](crates/adapters/el-engine-candle) | Candle inference adapter: engine-seam proof plus a real Qwen2 transformer engine and chat provider | Implemented; real on-device chat | | [`crates/adapters/el-cloud`](crates/adapters/el-cloud) | Opt-in OpenAI-compatible provider backend | Implemented as an explicit egress adapter | | [`crates/adapters/el-grammar-llguidance`](crates/adapters/el-grammar-llguidance) | llguidance JSON-schema token masking | Implemented and tested; workspace-excluded (crates.io deps) | | [`crates/adapters/el-ffi`](crates/adapters/el-ffi) | Flutter/UniFFI/wasm-bindgen binding surfaces | Implemented and tested (native + wasm32 compile); workspace-excluded (cross toolchains) | +| [`apps/el-chat`](apps/el-chat) | Interactive chat test client; SDK-only deps, drives the runtime end-to-end | Implemented; runs real on-device chat | ## Architecture Decisions diff --git a/apps/el-chat/Cargo.toml b/apps/el-chat/Cargo.toml new file mode 100644 index 0000000..8bdd643 --- /dev/null +++ b/apps/el-chat/Cargo.toml @@ -0,0 +1,24 @@ +# el-chat — interactive local-LLM chat test client. 
+# +# Its purpose is to TEST THE SDK end-to-end, so its only direct dependencies are +# SDK crates. All inference goes through the SDK: `el_engine_candle::QwenChatProvider` +# (a real Qwen2 transformer engine driven by `el_runtime::InferenceSession`) +# behind the `el_core::LlmProvider` trait. The app contains no model/tokenizer +# code of its own. +[package] +name = "el-chat" +description = "Interactive local-LLM chat test client — exercises the SDK's LlmProvider/runtime end-to-end." +version.workspace = true +edition.workspace = true +license.workspace = true + +[[bin]] +name = "el-chat" +path = "src/main.rs" + +[dependencies] +el-core = { workspace = true } +el-engine-candle = { workspace = true } + +[lints.rust] +unsafe_code = "forbid" diff --git a/apps/el-chat/README.md b/apps/el-chat/README.md new file mode 100644 index 0000000..34b3772 --- /dev/null +++ b/apps/el-chat/README.md @@ -0,0 +1,146 @@ +# el-chat — local LLM chat test client + +An interactive command-line client that holds a multi-turn conversation with a +small LLM running **entirely on your machine**. It exists to exercise the Edge +Intelligence SDK end-to-end: every reply flows through the SDK's public seams, +and the client itself depends only on SDK crates (`el-core`, `el-engine-candle`) +— it contains no inference, model, or tokenizer code of its own. + +``` +el-chat → el_core::LlmProvider → el_engine_candle::QwenChatProvider + (real Qwen2 forward via candle-transformers) + → el_runtime::InferenceSession + (provenance gate → prefill → decode loop) +``` + +--- + +## 1. Prerequisites + +- **Rust 1.96+** (matches the workspace `rust-version`). +- A **local GGUF model** of the Qwen2 family plus its `tokenizer.json` + (downloaded once, see below). Nothing is fetched at runtime — the client runs + fully offline / air-gapped (ADR-004). + +## 2. 
Get a model (once) + +From the **repository root**, download a small instruct model (~470 MB) and its +tokenizer into a git-ignored `models/` directory: + +```sh +mkdir -p models + +curl -sSL -o models/qwen2.5-0.5b-instruct-q4_k_m.gguf \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf + +curl -sSL -o models/qwen2.5-0.5b-instruct.tokenizer.json \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json +``` + +These two paths are the client's defaults, so no flags are needed if you place +the files here. Any other Qwen2-family GGUF works too — pass `--model` / +`--tokenizer` to point elsewhere. + +## 3. Run + +Interactive REPL (run from the repository root): + +```sh +cargo run -p el-chat +``` + +You'll see a prompt; type a message and press Enter: + +``` +you> What is the capital of France? +bot> The capital of France is Paris. +you> What language do they speak there? +bot> They speak French in France. +you> /exit +``` + +The second answer shows that context carries across turns. + +One-shot, non-interactive (handy for scripts and quick checks): + +```sh +# Reply to a single message and exit: +cargo run -p el-chat -- --prompt "Explain a mutex in one sentence." --once + +# Or pipe the message in on stdin: +echo "List three primary colors." | cargo run -p el-chat -- --once +``` + +> First run compiles the ML dependencies (a few minutes). Subsequent runs start +> in well under a second. The model itself loads in ~0.5 s. + +## 4. 
Options + +| Flag | Default | Description | +|------|---------|-------------| +| `-m`, `--model ` | `models/qwen2.5-0.5b-instruct-q4_k_m.gguf` | GGUF model file | +| `-t`, `--tokenizer ` | `models/qwen2.5-0.5b-instruct.tokenizer.json` | `tokenizer.json` | +| `-s`, `--system ` | "You are a helpful, concise assistant…" | System prompt | +| `-p`, `--prompt ` | — | Send one message, print the reply, exit | +| `--once` | — | Read one line from stdin, reply, exit | +| `--max-tokens ` | `512` | Max tokens generated per reply | +| `-h`, `--help` | — | Show help | + +Example with a custom persona and shorter replies: + +```sh +cargo run -p el-chat -- \ + --system "You are a terse pirate. Answer in one sentence." \ + --max-tokens 80 +``` + +## 5. REPL commands + +Type these instead of a message: + +| Command | Effect | +|---------|--------| +| `/reset` | Clear the conversation (keep the current system prompt) | +| `/system ` | Replace the system prompt and start a fresh conversation | +| `/help` | Show usage | +| `/exit` (`/quit`, `/q`) | Leave the client (Ctrl-D also works) | + +## 6. How it works + +The client builds a `Vec` (a system message plus the running +conversation) and, each turn, hands the whole list to the SDK as a +`ChatRequest`. `QwenChatProvider` renders it to Qwen2.5 ChatML, tokenizes it, +and runs the standard `el_runtime::InferenceSession`: + +1. a **provenance `LoadPermit`** is required before the model can load (ADR-006); +2. **prefill** feeds the prompt into the engine's KV cache; +3. the **decode loop** runs `grammar mask → safety steer → commit` per token. + +Replies print through the SDK's `LlmProvider::chat_stream`. + +## 7. Notes and limitations + +- **Deterministic output.** The SDK runtime decodes with greedy argmax — there is + no temperature/top-p sampling on the local path — so the same prompt always + produces the same reply. (Cloud backends, via `el-cloud`, do honor + temperature.) 
+- **Per-turn cost grows.** Each turn rebuilds the session and re-processes the + whole conversation, so later turns in a long chat take longer. Use `/reset` + to start fresh. Keep `--max-tokens` modest for snappier replies. +- **Small model.** Qwen2.5-0.5B is fast and runs anywhere, but it is a 0.5B + model — expect occasional mistakes. Larger Qwen2 GGUFs improve quality at the + cost of speed and memory. +- **CPU only** in the default build. Replies generate at a few tokens/second on + a typical laptop CPU. + +## 8. Troubleshooting + +- **`model file not found`** — you haven't downloaded the model, or you're not + running from the repository root. Re-check step 2 or pass `--model` / + `--tokenizer` with explicit paths. +- **`failed to load tokenizer.json`** — the tokenizer path is wrong or the file + is corrupt; re-download it. +- **`GGUF: failed to load Qwen2 weights`** — the file isn't a Qwen2-family GGUF + (this client uses the Qwen2 architecture). Use a Qwen2/Qwen2.5 GGUF. +- **Garbled or repetitive output** — make sure the model and tokenizer come from + the *same* model family/version. diff --git a/apps/el-chat/src/main.rs b/apps/el-chat/src/main.rs new file mode 100644 index 0000000..037020a --- /dev/null +++ b/apps/el-chat/src/main.rs @@ -0,0 +1,232 @@ +//! `el-chat` — an interactive test client that holds a multi-turn chat with a +//! small **local** LLM (Qwen2.5-0.5B-Instruct, GGUF) running entirely on-device. +//! +//! It exists to exercise the SDK end-to-end: every reply flows through +//! [`el_engine_candle::QwenChatProvider`] → [`el_core::LlmProvider`] → +//! `el_runtime::InferenceSession` (provenance gate → prefill → decode loop). +//! The client itself depends only on SDK crates and contains no inference, +//! model, or tokenizer code of its own. +//! +//! ```text +//! cargo run -p el-chat # interactive REPL, ./models defaults +//! cargo run -p el-chat -- --prompt "hi" --once # one-shot, non-interactive +//! ``` +//! +//! 
REPL commands: `/reset`, `/system `, `/help`, `/exit`. +//! +//! Decoding is the SDK runtime's deterministic greedy argmax, so replies are +//! reproducible (the local path does not sample on temperature). + +use std::io::{BufRead, Write}; +use std::path::PathBuf; +use std::time::Instant; + +use el_core::{ChatMessage, ChatRequest, ChatToken, LlmProvider}; +use el_engine_candle::QwenChatProvider; + +const DEFAULT_MODEL: &str = "models/qwen2.5-0.5b-instruct-q4_k_m.gguf"; +const DEFAULT_TOKENIZER: &str = "models/qwen2.5-0.5b-instruct.tokenizer.json"; +const DEFAULT_SYSTEM: &str = "You are a helpful, concise assistant running locally on-device."; + +struct Args { + model: PathBuf, + tokenizer: PathBuf, + system: String, + max_tokens: u32, + once: Option, +} + +fn parse_args() -> Result { + let mut model = PathBuf::from(DEFAULT_MODEL); + let mut tokenizer = PathBuf::from(DEFAULT_TOKENIZER); + let mut system = DEFAULT_SYSTEM.to_string(); + let mut max_tokens = 512u32; + let mut once = None; + + let mut it = std::env::args().skip(1); + while let Some(arg) = it.next() { + let mut next = |name: &str| it.next().ok_or_else(|| format!("{name} needs a value")); + match arg.as_str() { + "--model" | "-m" => model = PathBuf::from(next("--model")?), + "--tokenizer" | "-t" => tokenizer = PathBuf::from(next("--tokenizer")?), + "--system" | "-s" => system = next("--system")?, + "--prompt" | "-p" => once = Some(next("--prompt")?), + "--once" => once = once.or(Some(String::new())), + "--max-tokens" => { + max_tokens = next("--max-tokens")?.parse().map_err(|_| "bad --max-tokens")? 
+ } + "--help" | "-h" => return Err("help".to_string()), + other => return Err(format!("unknown argument: {other}")), + } + } + Ok(Args { + model, + tokenizer, + system, + max_tokens, + once, + }) +} + +fn usage() { + eprintln!( + "el-chat — local LLM chat test client (exercises the SDK)\n\n\ + USAGE:\n el-chat [OPTIONS]\n\n\ + OPTIONS:\n\ + \x20 -m, --model GGUF model file [default: {DEFAULT_MODEL}]\n\ + \x20 -t, --tokenizer tokenizer.json [default: {DEFAULT_TOKENIZER}]\n\ + \x20 -s, --system system prompt\n\ + \x20 -p, --prompt send one message, print the reply, exit\n\ + \x20 --once read one line from stdin, reply, exit\n\ + \x20 --max-tokens max generated tokens per reply [default: 512]\n\ + \x20 -h, --help show this help\n\n\ + REPL COMMANDS: /reset /system /help /exit" + ); +} + +fn main() { + let args = match parse_args() { + Ok(a) => a, + Err(e) => { + if e != "help" { + eprintln!("error: {e}\n"); + } + usage(); + std::process::exit(if e == "help" { 0 } else { 2 }); + } + }; + + if !args.model.exists() { + eprintln!( + "error: model file not found: {}\n\nFetch a small instruct model, e.g.:\n \ + curl -sSL -o {DEFAULT_MODEL} \\\n \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf\n \ + curl -sSL -o {DEFAULT_TOKENIZER} \\\n \ + https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json", + args.model.display() + ); + std::process::exit(1); + } + + eprint!("loading {} ... ", args.model.display()); + let _ = std::io::stderr().flush(); + let load_start = Instant::now(); + let provider = match QwenChatProvider::from_paths(&args.model, &args.tokenizer) { + Ok(p) => p, + Err(e) => { + eprintln!("\nerror: failed to load model: {e}"); + std::process::exit(1); + } + }; + eprintln!("ready ({:.1}s)", load_start.elapsed().as_secs_f64()); + + let mut history: Vec = vec![ChatMessage::system(&args.system)]; + + // One-shot mode: --prompt "..." or --once (read a single stdin line). 
+ if let Some(p) = args.once { + let text = if p.is_empty() { + let mut line = String::new(); + let _ = std::io::stdin().lock().read_line(&mut line); + line.trim().to_string() + } else { + p + }; + if !text.is_empty() { + history.push(ChatMessage::user(text)); + let req = ChatRequest::new("local", history.clone()).with_max_tokens(args.max_tokens); + let _ = run_turn(&provider, &req); + println!(); + } + return; + } + + eprintln!( + "\nLocal chat ready. Type a message; '/help' for commands, '/exit' to quit.\n\ + (system: {})\n", + args.system + ); + + let stdin = std::io::stdin(); + loop { + print!("\x1b[1;34myou>\x1b[0m "); + let _ = std::io::stdout().flush(); + + let mut line = String::new(); + match stdin.lock().read_line(&mut line) { + Ok(0) => break, // EOF + Ok(_) => {} + Err(e) => { + eprintln!("input error: {e}"); + break; + } + } + let input = line.trim(); + if input.is_empty() { + continue; + } + + if let Some(rest) = input.strip_prefix('/') { + let mut parts = rest.splitn(2, ' '); + match parts.next().unwrap_or("") { + "exit" | "quit" | "q" => break, + "help" | "h" => { + usage(); + continue; + } + "reset" => { + history = vec![ChatMessage::system(&args.system)]; + eprintln!("(conversation reset)"); + continue; + } + "system" => { + let new_sys = parts.next().unwrap_or("").trim(); + if new_sys.is_empty() { + eprintln!("(usage: /system )"); + } else { + history = vec![ChatMessage::system(new_sys)]; + eprintln!("(system prompt updated; conversation reset)"); + } + continue; + } + other => { + eprintln!("(unknown command '/{other}'; try /help)"); + continue; + } + } + } + + history.push(ChatMessage::user(input.to_string())); + let req = ChatRequest::new("local", history.clone()).with_max_tokens(args.max_tokens); + + match run_turn(&provider, &req) { + Ok(reply) => history.push(ChatMessage::assistant(reply)), + Err(e) => { + eprintln!("\n(generation error: {e}; conversation reset)"); + history = vec![ChatMessage::system(&args.system)]; + } + } + } + 
eprintln!("bye."); +} + +/// Stream one assistant reply to stdout via the SDK's `LlmProvider::chat_stream`; +/// returns the accumulated text so the caller can append it to history. +fn run_turn(provider: &QwenChatProvider, req: &ChatRequest) -> el_core::Result { + print!("\x1b[1;32mbot>\x1b[0m "); + let _ = std::io::stdout().flush(); + + let start = Instant::now(); + let mut reply = String::new(); + provider.chat_stream(req, &mut |t: ChatToken| { + if t.is_final || t.text.is_empty() { + return; + } + reply.push_str(&t.text); + print!("{}", t.text); + let _ = std::io::stdout().flush(); + })?; + + let secs = start.elapsed().as_secs_f64(); + eprintln!("\n\x1b[2m[{secs:.1}s]\x1b[0m"); + Ok(reply) +} diff --git a/crates/adapters/el-engine-candle/Cargo.toml b/crates/adapters/el-engine-candle/Cargo.toml index 77d00d6..5938b6c 100644 --- a/crates/adapters/el-engine-candle/Cargo.toml +++ b/crates/adapters/el-engine-candle/Cargo.toml @@ -13,6 +13,11 @@ el-core = { workspace = true } el-provenance = { workspace = true } el-runtime = { workspace = true } candle-core = "0.8" +# Real transformer forward for local chat (Qwen2 family) + its tokenizer. +# `tokenizers` uses the pure-Rust `fancy-regex` backend (no C/C++ onig/esaxx), +# per ADR-008. 
+candle-transformers = "0.8" +tokenizers = { version = "0.21", default-features = false, features = ["fancy-regex"] } [features] default = [] diff --git a/crates/adapters/el-engine-candle/src/lib.rs b/crates/adapters/el-engine-candle/src/lib.rs index 4f80f2a..26d03e4 100644 --- a/crates/adapters/el-engine-candle/src/lib.rs +++ b/crates/adapters/el-engine-candle/src/lib.rs @@ -284,6 +284,260 @@ impl LlmProvider for LocalLlmProvider { } } +// ── Real Qwen2 transformer engine + chat provider (ADR-002 + ADR-010) ──────── +// +// Unlike `CandleEngine` (a single linear projection used as the engine-seam +// proof) this runs a genuine Qwen2 transformer forward via `candle-transformers` +// with a real HuggingFace tokenizer, so it produces coherent chat. It plugs into +// the SAME `el_runtime::InferenceSession` decode loop as every other engine — +// nothing in the SDK pipeline is bypassed. + +use candle_transformers::models::quantized_qwen2::ModelWeights as Qwen2Weights; +use el_core::{ModelId, ModelVersion}; +use el_provenance::{ModelArtifact, SignatureVerifier}; +use tokenizers::Tokenizer; + +/// A real Qwen2 transformer `InferenceEngine`. +/// +/// Holds candle's stateful KV cache. Within one generation it is fed +/// incrementally (prefill, then one new token per `next_logits` call); candle +/// exposes no public cache reset, so a fresh conversation builds a new engine. +/// Float logits are quantised to integer milli-logits at the seam, exactly like +/// [`CandleEngine`], so the runtime stays float-free. +pub struct QwenEngine { + model: Qwen2Weights, + device: Device, + /// Absolute KV position written so far (candle's `index_pos`). + index_pos: usize, + /// How many of the runtime-`committed` tokens have already been fed. + fed: usize, + /// Milli-logits produced after the most recent forward. + last_logits: Vec, + vocab: usize, + eos: Token, +} + +impl QwenEngine { + /// Load Qwen2 weights from a consumer-supplied GGUF file. 
+ pub fn from_path(path: impl AsRef, eos: Token) -> Result { + use candle_core::quantized::gguf_file; + let mut file = std::fs::File::open(path.as_ref()) + .map_err(|_| EdgeError::Engine("model file not found or not readable"))?; + let content = gguf_file::Content::read(&mut file) + .map_err(|_| EdgeError::Engine("GGUF: invalid or unrecognised file"))?; + let device = Device::Cpu; + let model = Qwen2Weights::from_gguf(content, &mut file, &device) + .map_err(|_| EdgeError::Engine("GGUF: failed to load Qwen2 weights"))?; + Ok(Self { + model, + device, + index_pos: 0, + fed: 0, + last_logits: Vec::new(), + vocab: 0, + eos, + }) + } + + /// One forward over a single token at the current position; advances the KV + /// cache and returns milli-logits for the next token. + fn forward_one(&mut self, token: Token) -> Result> { + let input = Tensor::from_vec(vec![token], (1, 1), &self.device) + .map_err(|_| EdgeError::Engine("candle: input tensor build failed"))?; + let logits = self + .model + .forward(&input, self.index_pos) + .map_err(|_| EdgeError::Engine("candle: Qwen2 forward failed"))?; + self.index_pos += 1; + let row = logits + .squeeze(0) + .map_err(|_| EdgeError::Engine("candle: squeeze logits failed"))?; + let floats = row + .to_vec1::() + .map_err(|_| EdgeError::Engine("candle: logits to vec failed"))?; + Ok(floats.iter().map(|x| (x * 1000.0).round() as i32).collect()) + } +} + +impl InferenceEngine for QwenEngine { + fn prefill(&mut self, tokens: &[Token]) -> Result { + self.index_pos = 0; + self.fed = 0; + for &t in tokens { + self.last_logits = self.forward_one(t)?; + } + self.vocab = self.last_logits.len(); + Ok(tokens.len() as u32) + } + + fn next_logits(&mut self, committed: &[Token]) -> Vec { + // Feed any newly committed (generated) tokens beyond what we've seen. + // `committed` grows by exactly one per decode step, so this feeds the + // token the runtime just sampled and returns the next distribution. 
+ while self.fed < committed.len() { + let t = committed[self.fed]; + match self.forward_one(t) { + Ok(l) => self.last_logits = l, + Err(_) => return vec![0; self.vocab.max(1)], + } + self.fed += 1; + } + self.last_logits.clone() + } + + fn eos_token(&self) -> Token { + self.eos + } +} + +/// A real local chat backend: a Qwen2 GGUF model + its tokenizer, driven +/// through [`el_runtime::InferenceSession`]. +/// +/// Each `chat` call renders the whole conversation to Qwen2.5 ChatML, builds a +/// fresh [`QwenEngine`] (candle has no public KV-cache reset), then runs the +/// SDK's standard provenance-gated session: `load_prompt` (prefill) → +/// `generate` (grammar mask → safety steer → greedy commit). The provider holds +/// no mutable session state, so it is `Send + Sync` without locking. +pub struct QwenChatProvider { + model_path: std::path::PathBuf, + tokenizer: Tokenizer, + permit: LoadPermit, + eos: Token, + default_max_tokens: u32, + model_label: String, +} + +impl QwenChatProvider { + /// Load a Qwen2 GGUF model and its `tokenizer.json` from local paths. + pub fn from_paths( + model_path: impl AsRef, + tokenizer_path: impl AsRef, + ) -> Result { + let model_path = model_path.as_ref().to_path_buf(); + if !model_path.exists() { + return Err(EdgeError::Engine("model file not found")); + } + let tokenizer = Tokenizer::from_file(tokenizer_path.as_ref()) + .map_err(|_| EdgeError::Engine("failed to load tokenizer.json"))?; + + // Stop token: Qwen2.5 ChatML turn terminator (fallback to its known id). 
+ let eos = tokenizer.token_to_id("<|im_end|>").unwrap_or(151_645); + + let model_label = model_path + .file_stem() + .and_then(|s| s.to_str()) + .map(|s| format!("local/{s}")) + .unwrap_or_else(|| "local/qwen2".to_string()); + + Ok(Self { + model_path, + tokenizer, + permit: local_load_permit()?, + eos, + default_max_tokens: 512, + model_label, + }) + } + + fn encode(&self, text: &str) -> Result> { + let enc = self + .tokenizer + .encode(text, false) + .map_err(|_| EdgeError::Engine("tokenizer encode failed"))?; + Ok(enc.get_ids().to_vec()) + } + + fn decode(&self, ids: &[Token]) -> Result { + self.tokenizer + .decode(ids, true) + .map_err(|_| EdgeError::Engine("tokenizer decode failed")) + } +} + +impl LlmProvider for QwenChatProvider { + fn chat(&self, req: &ChatRequest) -> Result { + let prompt = render_chatml(&req.messages); + let prompt_tokens = self.encode(&prompt)?; + + // Fresh engine + session each turn (candle KV cache has no public reset); + // the full conversation is re-prefilled. This is the standard SDK path — + // provenance permit, session lifecycle, decode loop — not a shortcut. 
+ let engine = QwenEngine::from_path(&self.model_path, self.eos)?; + let mut session = + InferenceSession::new(SessionId(1), SessionConfig::default(), engine, self.permit); + let ports = Ports::permissive(); + session.load_prompt(&ports, &prompt_tokens)?; + + let max = req.max_tokens.unwrap_or(self.default_max_tokens); + session.generate(&ports, max)?; + + let out = session.output(); + let completion_tokens = out.len() as u32; + let content = self.decode(out)?.trim().to_string(); + + Ok(ChatResponse { + content, + model: self.model_label.clone(), + prompt_tokens: prompt_tokens.len() as u32, + completion_tokens, + }) + } + + fn chat_stream(&self, req: &ChatRequest, on_token: &mut dyn FnMut(ChatToken)) -> Result<()> { + // The runtime decode loop runs to completion internally (no per-token + // hook), so — like the toy `LocalLlmProvider` — we stream the finished + // reply out character by character. + let resp = self.chat(req)?; + for ch in resp.content.chars() { + on_token(ChatToken { + text: ch.to_string(), + is_final: false, + }); + } + on_token(ChatToken { + text: String::new(), + is_final: true, + }); + Ok(()) + } +} + +/// Render a conversation as Qwen2.5 ChatML and open an assistant turn. +fn render_chatml(messages: &[ChatMessage]) -> String { + let mut s = String::new(); + for m in messages { + let role = match m.role { + ChatRole::System => "system", + ChatRole::User => "user", + ChatRole::Assistant => "assistant", + }; + s.push_str("<|im_start|>"); + s.push_str(role); + s.push('\n'); + s.push_str(&m.content); + s.push_str("<|im_end|>\n"); + } + s.push_str("<|im_start|>assistant\n"); + s +} + +/// Obtain a [`LoadPermit`] through the real ADR-006 gate for a user-supplied +/// local model. There is no detached signature to check for a file the user +/// downloaded themselves, so a trust-the-local-file verifier is used — the +/// point is to go through the gate API the runtime requires, not to bypass it. 
+fn local_load_permit() -> Result { + struct LocalFileTrust; + impl SignatureVerifier for LocalFileTrust { + fn verify(&self, _bytes: &[u8], _sig: &[u8], _key: u32) -> bool { + true + } + } + let mut artifact = + ModelArtifact::new(ModelId(1), ModelVersion::new(0, 1, 0), el_core::ModelFormat::Gguf); + artifact.verify(&LocalFileTrust, b"local-file", b"local-file", 0); + artifact.ensure_loadable() +} + #[cfg(test)] mod tests { use super::*; @@ -523,4 +777,40 @@ mod tests { ); assert!(matches!(r, Err(EdgeError::Engine(_)))); } + + // ── Qwen provider helpers ───────────────────────────────────────────────── + + #[test] + fn render_chatml_wraps_each_turn_and_opens_assistant() { + let msgs = vec![ + ChatMessage::system("be nice"), + ChatMessage::user("hi"), + ChatMessage::assistant("hello"), + ChatMessage::user("bye"), + ]; + let got = render_chatml(&msgs); + let want = "<|im_start|>system\nbe nice<|im_end|>\n\ + <|im_start|>user\nhi<|im_end|>\n\ + <|im_start|>assistant\nhello<|im_end|>\n\ + <|im_start|>user\nbye<|im_end|>\n\ + <|im_start|>assistant\n"; + assert_eq!(got, want); + } + + #[test] + fn local_load_permit_passes_the_provenance_gate() { + // The runtime requires a LoadPermit; the local-trust path must yield one + // for a GGUF artifact (ADR-006 gate exercised, not bypassed). 
+ let permit = local_load_permit().expect("local permit issued"); + assert_eq!(permit.format, el_core::ModelFormat::Gguf); + } + + #[test] + fn qwen_provider_from_paths_missing_model_errors() { + let r = QwenChatProvider::from_paths( + std::path::Path::new("/nonexistent/model.gguf"), + std::path::Path::new("/nonexistent/tokenizer.json"), + ); + assert!(matches!(r, Err(EdgeError::Engine(_)))); + } } From 4c7ff0fead584f0a59f3cbfaf131cc4eb6c93b29 Mon Sep 17 00:00:00 2001 From: Tovli Date: Mon, 15 Jun 2026 06:27:58 +0300 Subject: [PATCH 2/3] first benchmark --- crates/adapters/el-engine-candle/src/lib.rs | 180 ++++++++++- .../2026-06-14-qwen-chat-bottleneck.md | 293 ++++++++++++++++++ 2 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md diff --git a/crates/adapters/el-engine-candle/src/lib.rs b/crates/adapters/el-engine-candle/src/lib.rs index 26d03e4..c2cda5e 100644 --- a/crates/adapters/el-engine-candle/src/lib.rs +++ b/crates/adapters/el-engine-candle/src/lib.rs @@ -297,6 +297,48 @@ use el_core::{ModelId, ModelVersion}; use el_provenance::{ModelArtifact, SignatureVerifier}; use tokenizers::Tokenizer; +// ── Opt-in benchmark instrumentation (EL_BENCH=1) ──────────────────────────── +// +// Zero-cost when `EL_BENCH` is unset: `enabled()` short-circuits and no timing +// is taken. When set, `QwenChatProvider::chat` prints a per-phase breakdown and +// per-forward attribution (model compute vs. seam quantisation vs. runtime loop) +// to stderr. Diagnostics only — not part of the SDK's public behaviour. +mod bench { + use std::cell::Cell; + use std::sync::OnceLock; + use std::time::Duration; + + static ENABLED: OnceLock = OnceLock::new(); + + /// True iff the `EL_BENCH` environment variable is present (read once). + pub fn enabled() -> bool { + *ENABLED.get_or_init(|| std::env::var_os("EL_BENCH").is_some()) + } + + thread_local! 
{ + static FWD_TOTAL: Cell = const { Cell::new(Duration::ZERO) }; + static FWD_MODEL: Cell = const { Cell::new(Duration::ZERO) }; + static FWD_CALLS: Cell = const { Cell::new(0) }; + } + + /// Accumulate one `forward_one` sample: `total` is the whole seam call, + /// `model` is just the candle transformer forward inside it. + pub fn record(total: Duration, model: Duration) { + FWD_TOTAL.with(|c| c.set(c.get() + total)); + FWD_MODEL.with(|c| c.set(c.get() + model)); + FWD_CALLS.with(|c| c.set(c.get() + 1)); + } + + /// Read and reset the forward accumulators: `(total, model, calls)`. + pub fn take() -> (Duration, Duration, u64) { + ( + FWD_TOTAL.replace(Duration::ZERO), + FWD_MODEL.replace(Duration::ZERO), + FWD_CALLS.replace(0), + ) + } +} + /// A real Qwen2 transformer `InferenceEngine`. /// /// Holds candle's stateful KV cache. Within one generation it is fed @@ -342,12 +384,18 @@ impl QwenEngine { /// One forward over a single token at the current position; advances the KV /// cache and returns milli-logits for the next token. 
fn forward_one(&mut self, token: Token) -> Result> { + let t_total = bench::enabled().then(std::time::Instant::now); + let input = Tensor::from_vec(vec![token], (1, 1), &self.device) .map_err(|_| EdgeError::Engine("candle: input tensor build failed"))?; + + let t_model = bench::enabled().then(std::time::Instant::now); let logits = self .model .forward(&input, self.index_pos) .map_err(|_| EdgeError::Engine("candle: Qwen2 forward failed"))?; + let model_dur = t_model.map(|t| t.elapsed()).unwrap_or_default(); + self.index_pos += 1; let row = logits .squeeze(0) @@ -355,7 +403,12 @@ impl QwenEngine { let floats = row .to_vec1::() .map_err(|_| EdgeError::Engine("candle: logits to vec failed"))?; - Ok(floats.iter().map(|x| (x * 1000.0).round() as i32).collect()) + let out: Vec = floats.iter().map(|x| (x * 1000.0).round() as i32).collect(); + + if let Some(t) = t_total { + bench::record(t.elapsed(), model_dur); + } + Ok(out) } } @@ -457,23 +510,54 @@ impl QwenChatProvider { impl LlmProvider for QwenChatProvider { fn chat(&self, req: &ChatRequest) -> Result { let prompt = render_chatml(&req.messages); + + let t_encode = bench::enabled().then(std::time::Instant::now); let prompt_tokens = self.encode(&prompt)?; + let d_encode = t_encode.map(|t| t.elapsed()).unwrap_or_default(); // Fresh engine + session each turn (candle KV cache has no public reset); // the full conversation is re-prefilled. This is the standard SDK path — // provenance permit, session lifecycle, decode loop — not a shortcut. 
+ let t_load = bench::enabled().then(std::time::Instant::now); let engine = QwenEngine::from_path(&self.model_path, self.eos)?; + let d_load = t_load.map(|t| t.elapsed()).unwrap_or_default(); + let mut session = InferenceSession::new(SessionId(1), SessionConfig::default(), engine, self.permit); let ports = Ports::permissive(); + + let _ = bench::take(); // clear forward accumulators before prefill + let t_prefill = bench::enabled().then(std::time::Instant::now); session.load_prompt(&ports, &prompt_tokens)?; + let d_prefill = t_prefill.map(|t| t.elapsed()).unwrap_or_default(); + let (pf_total, pf_model, pf_calls) = bench::take(); let max = req.max_tokens.unwrap_or(self.default_max_tokens); + let t_decode = bench::enabled().then(std::time::Instant::now); session.generate(&ports, max)?; + let d_decode = t_decode.map(|t| t.elapsed()).unwrap_or_default(); + let (dc_total, dc_model, dc_calls) = bench::take(); let out = session.output(); let completion_tokens = out.len() as u32; + + let t_detok = bench::enabled().then(std::time::Instant::now); let content = self.decode(out)?.trim().to_string(); + let d_detok = t_detok.map(|t| t.elapsed()).unwrap_or_default(); + + if bench::enabled() { + report_breakdown( + prompt_tokens.len() as u32, + completion_tokens, + d_load, + d_encode, + d_prefill, + d_decode, + d_detok, + (pf_total, pf_model, pf_calls), + (dc_total, dc_model, dc_calls), + ); + } Ok(ChatResponse { content, @@ -502,6 +586,100 @@ impl LlmProvider for QwenChatProvider { } } +/// Print an `EL_BENCH` per-phase + per-forward breakdown for one `chat()` call. 
+#[allow(clippy::too_many_arguments)] +fn report_breakdown( + prompt_tokens: u32, + completion_tokens: u32, + d_load: std::time::Duration, + d_encode: std::time::Duration, + d_prefill: std::time::Duration, + d_decode: std::time::Duration, + d_detok: std::time::Duration, + prefill_fwd: (std::time::Duration, std::time::Duration, u64), + decode_fwd: (std::time::Duration, std::time::Duration, u64), +) { + let ms = |d: std::time::Duration| d.as_secs_f64() * 1000.0; + let total = d_load + d_encode + d_prefill + d_decode + d_detok; + let pct = |d: std::time::Duration| { + if total.as_secs_f64() > 0.0 { + d.as_secs_f64() / total.as_secs_f64() * 100.0 + } else { + 0.0 + } + }; + let tps = |n: u32, d: std::time::Duration| { + if d.as_secs_f64() > 0.0 { + n as f64 / d.as_secs_f64() + } else { + 0.0 + } + }; + + let (pf_total, pf_model, pf_calls) = prefill_fwd; + let (dc_total, dc_model, dc_calls) = decode_fwd; + let dc_loop = d_decode.saturating_sub(dc_total); + let dc_seam = dc_total.saturating_sub(dc_model); + let per_tok = |d: std::time::Duration, n: u64| if n > 0 { ms(d) / n as f64 } else { 0.0 }; + + eprintln!("\n┌─ EL_BENCH chat() breakdown ───────────────────────────────"); + eprintln!( + "│ prompt_tokens={prompt_tokens} completion_tokens={completion_tokens}" + ); + eprintln!("│ phase wall(ms) %total throughput"); + eprintln!( + "│ model load {:>9.1} {:>6.1}% (read+dequantize GGUF)", + ms(d_load), + pct(d_load) + ); + eprintln!( + "│ tokenize {:>9.2} {:>6.1}%", + ms(d_encode), + pct(d_encode) + ); + eprintln!( + "│ prefill {:>9.1} {:>6.1}% {:>7.1} tok/s", + ms(d_prefill), + pct(d_prefill), + tps(prompt_tokens, d_prefill) + ); + eprintln!( + "│ decode {:>9.1} {:>6.1}% {:>7.1} tok/s", + ms(d_decode), + pct(d_decode), + tps(completion_tokens, d_decode) + ); + eprintln!( + "│ detokenize {:>9.2} {:>6.1}%", + ms(d_detok), + pct(d_detok) + ); + eprintln!("│ TOTAL {:>9.1}", ms(total)); + eprintln!("│ ─ forward attribution (where prefill+decode time goes) ─"); + eprintln!( + "│ 
prefill: {} fwd calls, model {:.1}ms, seam {:.1}ms, loop {:.1}ms", + pf_calls, + ms(pf_model), + ms(pf_total.saturating_sub(pf_model)), + ms(d_prefill.saturating_sub(pf_total)), + ); + eprintln!( + "│ decode : {} fwd calls, model {:.1}ms, seam {:.1}ms, loop {:.1}ms", + dc_calls, + ms(dc_model), + ms(dc_seam), + ms(dc_loop), + ); + eprintln!( + "│ per decoded token: {:.2}ms total = model {:.2} + seam {:.2} + loop {:.2}", + per_tok(d_decode, dc_calls), + per_tok(dc_model, dc_calls), + per_tok(dc_seam, dc_calls), + per_tok(dc_loop, dc_calls), + ); + eprintln!("└───────────────────────────────────────────────────────────"); +} + /// Render a conversation as Qwen2.5 ChatML and open an assistant turn. fn render_chatml(messages: &[ChatMessage]) -> String { let mut s = String::new(); diff --git a/docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md b/docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md new file mode 100644 index 0000000..79b1547 --- /dev/null +++ b/docs/benchmarks/2026-06-14-qwen-chat-bottleneck.md @@ -0,0 +1,293 @@ +# SDK Benchmark — Bottleneck Report (el-chat / Qwen2.5-0.5B) + +**Date:** 2026-06-14 +**Subject:** End-to-end latency of the local inference path exercised by the +`el-chat` test client (`el_core::LlmProvider` → `el_engine_candle::QwenChatProvider` +→ `el_runtime::InferenceSession`). +**Question:** Where does the time go, and what is the dominant bottleneck? + +--- + +## 1. 
Executive summary + +Measured on an Intel i5-14500 (14C/20T, 32 GB) with the release build +(`opt-level=3`, LTO, `codegen-units=1`), the **`el-chat` reply latency is +dominated by three structural costs, in priority order**: + +| # | Bottleneck | Cost (measured) | Share | Lever | +|---|------------|-----------------|-------|-------| +| **1** | **Prefill is not batched** — the prompt is fed one token at a time | ~65 ms **per prompt token** (209 tok → **14.3 s**) | up to **90%** on long prompts / later turns | SDK fix, ~10× win | +| **2** | **Full model reload every turn** — the 491 MB GGUF is re-read + re-parsed on *every* `chat()` call | **~1.2 s warm** (1.5 s cold) **per turn** | 20–33% on short turns | SDK fix, removes a flat tax | +| **3** | **Decode compute floor** — candle quantized Qwen2 forward | ~65 ms/token → **~15 tok/s** | 60–75% on long replies | kernel-level, hard | + +**What is *not* a bottleneck:** the SDK's own per-token glue — the runtime +decode loop, the `vec![true; vocab]` grammar mask, the full-vocab logits clone, +the argmax, event emission, and the float→milli-logit quantization at the engine +seam — together account for **< 1.2% of decode time (~0.8 ms of ~65 ms per +token)**. Optimizing these would not move the needle. + +The single highest-impact change is **#1 (batch the prefill)**: it is a pure +SDK-side defect (the engine issues `prompt_len` separate forwards instead of +one), it scales the worst (linear in prompt length, and prompt length grows +every conversation turn), and candle already supports the batched call. + +A closely related compounding effect: because the engine is rebuilt every turn +(#2), each turn **re-prefills the entire growing conversation from scratch** — +turn 2 in the measured run re-processed 56 tokens even though 32 of them were +already prefilled in turn 1. Fixing #2 enables KV reuse, which removes that +redundant re-prefill entirely. + +--- + +## 2. 
Method + +### 2.1 Harness + +The benchmark drives the **actual** `el-chat` test client binary — no synthetic +harness — so every number reflects the real public SDK path +(`LlmProvider::chat_stream` → `chat` → `QwenEngine` + `InferenceSession`). + +Phase timing was obtained by adding **opt-in, env-gated instrumentation** +(`EL_BENCH=1`) inside the engine adapter (`crates/adapters/el-engine-candle/src/lib.rs`): + +- `QwenChatProvider::chat` is timed per phase: **model load → tokenize → + prefill → decode → detokenize**. +- `QwenEngine::forward_one` is timed to split each forward into **model** + (the candle transformer `forward`) vs **seam** (tensor build + `to_vec1` + + float→milli-logit conversion). The remainder of each phase's wall time + (`wall − Σ forward`) is the **loop** overhead (runtime decode loop: + logits clone, mask alloc, argmax, KV push, event emit). + +The instrumentation is **zero-cost when `EL_BENCH` is unset** (a single +`OnceLock` short-circuits all timers) and is behavior-preserving — the +crate's 14 tests pass unchanged. It can be removed or kept as a diagnostic. + +### 2.2 Configuration + +- **Model:** `qwen2.5-0.5b-instruct-q4_k_m.gguf` (491 MB) + `tokenizer.json` (7 MB), loaded from local `models/` (air-gapped, ADR-004). +- **Build:** `cargo build --release -p el-chat`. (Release matters: the runtime/engine SDK crates are `opt-level=0` in dev, which would overstate the SDK glue cost. Release optimizes everything — the fair, production-representative measurement.) +- **Decoding:** deterministic greedy argmax (the SDK local path does not sample), so runs are reproducible. +- **CPU:** Intel i5-14500, 6 P-cores + 8 E-cores (20 threads). candle uses the `gemm`/rayon CPU backend; no MKL/Accelerate; no GPU. + +### 2.3 Runs + +| Run | Prompt | max-tokens | Purpose | +|-----|--------|-----------:|---------| +| A | "Hello! Who are you?" 
| 16 | baseline, cold load | +| B | TCP question | 128 | decode throughput | +| C | TCP question | 64 | decode cross-check | +| D | ~190-token system prompt | 4 | **prefill scaling** | +| F | 2-turn REPL | 24 | **multi-turn compounding** | +| — | "Count slowly." | 24 | thread sensitivity (1 / 6 / 20) | + +--- + +## 3. Raw results + +Phase wall times in ms; `tok/s` is that phase's throughput. + +| Run | prompt tok | compl. tok | model load | tokenize | **prefill** | **decode** | detok | TOTAL | prefill tok/s | decode tok/s | +|-----|-----------:|-----------:|-----------:|---------:|------------:|-----------:|------:|------:|------:|------:| +| A | 31 | 16 | 1504.1 | 1.8 | 2071.1 | 985.2 | 0.1 | 4562.3 | 15.0 | 16.2 | +| B | 29 | 128 | 1191.9 | 1.3 | 1771.2 | 8788.4 | 0.1 | 11752.9 | 16.4 | 14.6 | +| C | 29 | 64 | 1186.2 | 1.3 | 1808.3 | 4133.6 | 0.1 | 7129.5 | 16.0 | 15.5 | +| D | 209 | 3 | 1325.2 | 2.0 | **14299.2** | 203.4 | 0.0 | 15829.8 | 14.6 | 14.8 | +| F·t1 | 32 | 8 | 1174.6 | 1.3 | 1989.8 | 460.2 | 0.0 | 3626.0 | 16.1 | 17.4 | +| F·t2 | 56 | 15 | 1239.0 | 0.2 | 3749.0 | 1130.9 | 0.0 | 6119.1 | 14.9 | 13.3 | + +**Per-forward attribution** (representative — Run B decode, 127 forwards): + +``` +decode : 127 fwd calls, model 8694.4ms, seam 54.1ms, loop 39.9ms +per decoded token: 69.20ms total = model 68.46 + seam 0.43 + loop 0.31 +``` + +→ **model 98.9%, seam 0.6%, loop 0.5%.** The same split holds in every run. + +**Thread sensitivity** (per-token model time, "Count slowly.", 24 tokens): + +| Threads | ms/token | speedup vs 1 | +|--------:|---------:|-------------:| +| 1 | 311.7 | 1.0× | +| 6 (P-cores) | 77.8 | 4.0× | +| 20 (default) | 66.4 | 4.7× | + +--- + +## 4. Analysis + +### 4.1 Bottleneck #1 — Prefill is not batched *(highest impact)* + +Prefill throughput (14.6–16.4 tok/s) is **identical to decode throughput** +(13.3–17.4 tok/s), and per-forward time is ~65 ms whether the call happens +during prefill or decode. 
That is the signature of **no prefill batching**: the +prompt is processed as `prompt_len` independent single-token forwards rather +than one batched forward over the whole prompt. + +Run D makes it undeniable: **209 prompt tokens cost 14.3 s of prefill — 90% of +the entire request** — for a 3-token reply. + +Root cause — `crates/adapters/el-engine-candle/src/lib.rs`: + +```rust +// QwenEngine::prefill (lib.rs:419) +for &t in tokens { + self.last_logits = self.forward_one(t)?; // one forward PER prompt token +} +``` + +```rust +// QwenEngine::forward_one (lib.rs:389) +let input = Tensor::from_vec(vec![token], (1, 1), &self.device) // shape (1,1) — single token +``` + +candle's `quantized_qwen2::ModelWeights::forward(input, index_pos)` accepts a +`(batch, seq_len)` input — a real batched prefill is one call with the whole +prompt as `(1, prompt_len)`. A batched prefill reads each weight tensor **once** +for the whole prompt (compute-bound, parallelizes well) instead of once *per +token* (memory-bandwidth-bound, ×`prompt_len`). Expected effect: prefill drops +from `O(prompt_len × 65 ms)` to roughly a single forward's compute over +`prompt_len` positions — an **order-of-magnitude reduction** on non-trivial +prompts. + +This is the worst-scaling cost because prompt length grows every turn (§4.4). + +### 4.2 Bottleneck #2 — Full model reload every turn + +"loading … ready (0.2 s)" is **misleading**: `QwenChatProvider::from_paths` only +loads the *tokenizer*. The 491 MB GGUF weights are (re)loaded **inside every +`chat()` call**: + +```rust +// QwenChatProvider::chat (lib.rs:522) +let engine = QwenEngine::from_path(&self.model_path, self.eos)?; // re-reads + re-parses 491 MB, every turn +``` + +Measured cost: **~1.2 s warm (OS page cache hot), ~1.5 s cold** — paid on *every* +reply. On a short turn that is 20–33% of total latency (Runs A, F·t1); across a +REPL session it is a flat per-turn tax (Run F: 1.17 s on turn 1, 1.24 s on turn 2). 
+ +The code does this deliberately because **candle's quantized model exposes no +public KV-cache reset**, and the weights and the KV cache live in the same +`ModelWeights` object — so to get a clean cache the whole engine (weights +included) is rebuilt. The fix is to **separate the immutable weights (load once, +keep/`mmap`) from the per-conversation KV state** (the only thing that must +reset). That removes the reload tax and is the prerequisite for KV reuse (§4.4). + +### 4.3 Bottleneck #3 — Decode compute floor (~15 tok/s) + +Decode is **98.9% candle transformer forward** (Run B: model 68.46 ms of +69.20 ms/token). At ~15 tok/s for a 0.5B Q4 model on a 14-core CPU, this is +several times slower than hand-tuned stacks (llama.cpp-class kernels reach +50–100+ tok/s on comparable hardware). + +Thread sensitivity explains why this is a *floor*, not a tuning miss: 1→6 +threads gives 4.0×, 6→20 only adds 17% more. Throughput saturates at ~6 cores — +a profile that resembles **memory-bandwidth-bound batch-1 decode** (each token +must stream the full quantized weight set from RAM). But 15 tok/s is only ~5 GB/s effective, +far below the platform's DRAM bandwidth, so the limiter is more likely the q4_k_m matmul kernel +under-utilizing SIMD/cache than a saturated memory bus. Closing this gap +requires a faster quantized CPU kernel — an engine/kernel change, not an SDK +orchestration change. **Thread tuning is not a lever** (the default 20 threads is +already the fastest configuration measured; the 6 P-cores alone are only ~17% slower). + +### 4.4 Multi-turn compounding (interaction of #1 + #2) + +Run F shows the two structural costs compounding across a conversation: + +- **Turn 1:** 32 prompt tokens → reload 1.17 s + prefill 1.99 s. +- **Turn 2:** 56 prompt tokens → reload 1.24 s **again** + prefill **3.75 s**.
+ +Turn 2 re-prefills the *entire* history (system + user₁ + assistant₁ + user₂), +including the 32 tokens already prefilled in turn 1 — pure redundant work caused +by rebuilding the engine each turn (no KV carried over). As history grows, every +turn re-prefills everything from scratch, so per-turn latency grows roughly +linearly across the session. (The `el-chat` README already warns "per-turn cost +grows"; this quantifies *why* and shows it is fixable, not inherent.) + +### 4.5 What is NOT a bottleneck (red herrings) + +The per-token forward attribution shows the SDK's own work is negligible +(~0.8 ms of ~65 ms, < 1.2%). The following are *real allocations* but +**immaterial to latency** and should not be prioritized for performance +(only for tidiness/correctness): + +- `el-runtime/src/session.rs:139` — `next_logits` returns a fresh ~151 K-wide + `Vec` (~600 KB) every step; `el-engine-candle` `next_logits` + additionally `.clone()`s it (`lib.rs:438`). +- `el-runtime/src/session.rs:143` + `el-runtime/src/defaults.rs:21` — + `AllowAllMasker` allocates `vec![true; vocab]` (~152 KB) and `pick` scans the + full vocab each step, despite no grammar constraint being active on the chat + path. +- The engine seam re-allocates two vocab-sized vectors per forward (`to_vec1` + then the milli-logit `Vec`). +- `chat_stream` is **not** real streaming: it runs `chat()` to completion, then + replays the finished string char-by-char (`lib.rs:575`). This does not affect + throughput, but it means perceived **time-to-first-token = full generation + time**. With a per-token callback wired into `InferenceSession::generate`, + TTFT would drop to `load + prefill + 1 token` instead of waiting for the whole + reply. Worth fixing for UX, separately from the three throughput bottlenecks. + +--- + +## 5. Recommendations (prioritized) + +1. 
**Batch the prefill** *(biggest win, pure SDK fix).* Replace the per-token + loop in `QwenEngine::prefill` with a single `forward` over the whole prompt + `(1, prompt_len)`; track `index_pos` accordingly. Expected: long-prompt / + later-turn latency down ~10×; Run D's 14.3 s prefill → low single-digit + seconds. *(lib.rs:419, 389)* + +2. **Load weights once; reset only the KV cache** *(removes the flat ~1.2 s/turn + tax).* Keep the parsed `Qwen2Weights` (or at least `mmap` the GGUF) in + `QwenChatProvider` and add a KV-cache reset path rather than calling + `QwenEngine::from_path` per turn. Unblocks #3. *(lib.rs:522)* + +3. **Reuse KV across turns** *(removes redundant re-prefill; depends on #1+#2).* + Once weights persist and the cache survives, prefill only the **new** tokens + each turn instead of the whole conversation. Turns become near-constant cost + instead of linearly growing. + +4. **Real token streaming** *(UX / TTFT).* Add a per-token callback to + `InferenceSession::generate` and have `chat_stream` emit from inside the + decode loop instead of replaying a completed string. *(session.rs:124, + lib.rs:575)* + +5. **Faster quantized decode kernel** *(largest absolute ceiling, hardest).* The + ~15 tok/s decode floor is candle's q4_k_m CPU matmul. Revisit only after + 1–4; consider a tuned kernel or delegate. Thread tuning is **not** needed + (default is near-optimal). + +6. **(Low priority, non-perf) Trim per-step allocations** — reuse logit/mask + buffers and skip mask allocation when grammar is permissive. Cleanliness, not + speed (< 1.2% of decode). + +### Rough projected impact + +For a typical second turn (~56 prompt tok, ~30 reply tok), today ≈ +`1.2 (reload) + 3.7 (prefill) + 2.0 (decode) ≈ 6.9 s`. With #1+#2+#3: +`~0 (reload) + ~0.3 (prefill new tokens, batched) + 2.0 (decode) ≈ 2.3 s` — +roughly **3× faster**, and the saving grows with conversation length and prompt +size. Decode (#5) remains the residual floor. + +--- + +## 6. 
Reproduce + +```sh +cargo build --release -p el-chat + +# Per-phase breakdown for any invocation: +EL_BENCH=1 ./target/release/el-chat.exe --prompt "Hello!" --once --max-tokens 64 + +# Prefill scaling (long prompt, tiny reply): +EL_BENCH=1 ./target/release/el-chat.exe --system "$(cat long_prompt.txt)" \ + --prompt "Say OK." --once --max-tokens 4 + +# Multi-turn compounding: +printf 'What is 2+2?\nMultiply that by 3.\n/exit\n' | \ + EL_BENCH=1 ./target/release/el-chat.exe --max-tokens 24 +``` + +`EL_BENCH` prints the phase table + forward attribution to stderr; unset, it is +inert (no timing taken). From c726b71734fcc801ff8b87f438bb9996265487bd Mon Sep 17 00:00:00 2001 From: Tovli Date: Mon, 15 Jun 2026 07:42:12 +0300 Subject: [PATCH 3/3] Pin wasm-pack install in CI workflows --- .github/workflows/bindings.yml | 18 +++++++++++++++--- .github/workflows/release.yml | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.github/workflows/bindings.yml b/.github/workflows/bindings.yml index 9e24881..357142e 100644 --- a/.github/workflows/bindings.yml +++ b/.github/workflows/bindings.yml @@ -106,9 +106,21 @@ jobs: with: targets: wasm32-unknown-unknown - uses: Swatinem/rust-cache@v2 - - uses: jetli/wasm-pack-action@v0.4.0 - with: - version: latest + - name: Install wasm-pack + env: + WASM_PACK_VERSION: v0.15.0 + run: | + set -euo pipefail + tmp="$(mktemp -d)" + archive="wasm-pack-${WASM_PACK_VERSION}-x86_64-unknown-linux-musl.tar.gz" + curl -fsSL \ + "https://github.com/wasm-bindgen/wasm-pack/releases/download/${WASM_PACK_VERSION}/${archive}" \ + -o "${tmp}/${archive}" + tar -xzf "${tmp}/${archive}" -C "${tmp}" + sudo install -m 0755 \ + "${tmp}/wasm-pack-${WASM_PACK_VERSION}-x86_64-unknown-linux-musl/wasm-pack" \ + /usr/local/bin/wasm-pack + wasm-pack --version - name: Build WASM ESM package run: make build-wasm - uses: actions/upload-artifact@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 
35a8512..8ee5847 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -161,9 +161,21 @@ jobs: with: targets: wasm32-unknown-unknown - uses: Swatinem/rust-cache@v2 - - uses: jetli/wasm-pack-action@v0.4.0 - with: - version: latest + - name: Install wasm-pack + env: + WASM_PACK_VERSION: v0.15.0 + run: | + set -euo pipefail + tmp="$(mktemp -d)" + archive="wasm-pack-${WASM_PACK_VERSION}-x86_64-unknown-linux-musl.tar.gz" + curl -fsSL \ + "https://github.com/wasm-bindgen/wasm-pack/releases/download/${WASM_PACK_VERSION}/${archive}" \ + -o "${tmp}/${archive}" + tar -xzf "${tmp}/${archive}" -C "${tmp}" + sudo install -m 0755 \ + "${tmp}/wasm-pack-${WASM_PACK_VERSION}-x86_64-unknown-linux-musl/wasm-pack" \ + /usr/local/bin/wasm-pack + wasm-pack --version - name: Build WASM ESM package run: make build-wasm - uses: actions/upload-artifact@v4