From fa58992feb2a95add064d0b240d1c974f179f98d Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:40:27 -0700 Subject: [PATCH 01/10] feat(token-reduction): RTK filters, request bypass, MCP tool dedup, caveman MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port category-A token-reduction features inspired by 9router: - tool-result-compressor: add grep (per-file match cap), dedup_log (collapse repeated lines), and smart_truncate (head/tail fallback) compressors. - bypass: short-circuit Claude CLI housekeeping (Warmup, count, title prefill, isNewTopic title extraction) with a canned response — no provider round-trip. Always on; CLI-only so real work is never affected. - tool-dedup: strip built-in WebSearch/WebFetch when an equivalent MCP tool (Exa/Tavily) is present. Always on. - caveman: opt-in terse-output system-prompt injector (lite/full/ultra) to reduce output tokens; off by default. Adds test/token-reduction.test.js (17 cases). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/context/caveman.js | 94 +++++++++++++ src/context/tool-dedup.js | 95 ++++++++++++++ src/context/tool-result-compressor.js | 106 +++++++++++++++ src/orchestrator/bypass.js | 135 +++++++++++++++++++ test/token-reduction.test.js | 182 ++++++++++++++++++++++++++ 5 files changed, 612 insertions(+) create mode 100644 src/context/caveman.js create mode 100644 src/context/tool-dedup.js create mode 100644 src/orchestrator/bypass.js create mode 100644 test/token-reduction.test.js diff --git a/src/context/caveman.js b/src/context/caveman.js new file mode 100644 index 0000000..550b201 --- /dev/null +++ b/src/context/caveman.js @@ -0,0 +1,94 @@ +/** + * Caveman Terse-Output Injector + * + * Appends a brevity instruction to the system prompt so the model produces + * terser responses, reducing OUTPUT tokens. Opt-in and off by default — it + * changes model behavior, so it's only applied when explicitly enabled. 
+ * + * Enable with CAVEMAN_ENABLED=true. Level via CAVEMAN_LEVEL=lite|full|ultra + * (default: lite). Adapted from 9router's caveman injector / the caveman skill + * (https://github.com/JuliusBrussee/caveman). + * + * @module context/caveman + */ + +const config = require("../config"); +const logger = require("../logger"); + +const LEVELS = ["lite", "full", "ultra"]; + +// Shared guardrails so brevity never corrupts the substance that matters. +const BOUNDARIES = + "Code blocks, file paths, commands, errors, URLs: keep exact. " + + "Security warnings, irreversible-action confirmations, and multi-step ordered " + + "sequences: write in full normal prose. Resume terse style afterward."; + +const EXAMPLES = + 'Not: "Sure! I\'d be happy to help. The issue is likely caused by..." ' + + 'Yes: "Bug in auth middleware. Token expiry uses `<` not `<=`. Fix:"'; + +const PERSISTENCE = "Apply this to every response unless a guardrail above applies."; + +const PROMPTS = { + lite: [ + "Respond tersely. Keep grammar and full sentences but drop filler, hedging, and pleasantries (just/really/basically/sure/of course/I'd be happy to).", + "Pattern: state the thing, the action, the reason. Then the next step.", + EXAMPLES, + BOUNDARIES, + PERSISTENCE, + ].join(" "), + + full: [ + "Respond like a terse caveman. All technical substance stays exact; only fluff dies.", + "Drop articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries, and hedging. Fragments OK. Prefer short synonyms (big not extensive, fix not implement a solution for).", + "Pattern: [thing] [action] [reason]. [next step].", + EXAMPLES, + BOUNDARIES, + PERSISTENCE, + ].join(" "), + + ultra: [ + "Respond ultra-terse. Maximum compression. Telegraphic.", + "Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, use arrows for causality (X → Y). One word when one word is enough.", + "Pattern: [thing] → [result]. 
[fix].", + EXAMPLES, + BOUNDARIES, + PERSISTENCE, + ].join(" "), +}; + +const MARKER = "[brevity]"; + +/** Resolve the configured level, falling back to "lite". */ +function resolveLevel(level) { + const l = String(level || config.caveman?.level || "lite").toLowerCase(); + return LEVELS.includes(l) ? l : "lite"; +} + +/** + * Append the brevity instruction to a system prompt string. + * Idempotent — won't double-inject if the marker is already present. + * + * @param {string} system - Existing system prompt (may be empty). + * @param {object} [opts] + * @param {boolean} [opts.enabled] - Override config enablement. + * @param {string} [opts.level] - Override level. + * @returns {string} system prompt, possibly with brevity instruction appended. + */ +function injectCaveman(system, opts = {}) { + const enabled = opts.enabled ?? config.caveman?.enabled === true; + if (!enabled) return system || ""; + + const base = system || ""; + if (base.includes(MARKER)) return base; + + const level = resolveLevel(opts.level); + const instruction = `\n\n${MARKER} ${PROMPTS[level]}`; + logger.debug({ level }, "[Caveman] Injected brevity instruction into system prompt"); + return base + instruction; +} + +module.exports = { + injectCaveman, + LEVELS, +}; diff --git a/src/context/tool-dedup.js b/src/context/tool-dedup.js new file mode 100644 index 0000000..65f0aba --- /dev/null +++ b/src/context/tool-dedup.js @@ -0,0 +1,95 @@ +/** + * MCP-aware Tool Dedup + * + * Strips built-in tool definitions when an equivalent MCP tool is present in + * the request. Sending both wastes tool-schema tokens and gives the model + * redundant choices. Rule-based and deterministic. + * + * Example: if the Exa or Tavily MCP search tools are present, the built-in + * WebSearch/WebFetch tools are redundant and dropped. + * + * Ported from 9router's toolDeduper. Always on — purely removes redundant + * tool definitions, never adds. 
+ * + * @module context/tool-dedup + */ + +const logger = require("../logger"); + +// Each rule: if any `triggers` tool is present, strip any tools matching +// `strip`. Patterns may be exact strings or RegExp (matched against the name). +const DEDUP_RULES = [ + { + // Exa MCP present → drop built-in web tools (Exa is preferred). + triggers: ["mcp__exa__web_search_exa", "mcp__exa__web_fetch_exa"], + strip: ["WebSearch", "WebFetch", "web_search", "web_fetch", "mcp__workspace__web_fetch"], + }, + { + // Tavily MCP present → drop built-in web tools. + triggers: ["mcp__tavily__tavily_search", "mcp__tavily__tavily_extract"], + strip: ["WebSearch", "WebFetch", "web_search", "web_fetch", "mcp__workspace__web_fetch"], + }, + { + // Browser MCP present → drop a duplicate Chrome-connector tool family. + triggers: [/^mcp__browsermcp__/], + strip: [/^mcp__Claude_in_Chrome__/], + }, +]; + +function getToolName(t) { + return t?.name || t?.function?.name || ""; +} + +function matches(name, pattern) { + if (typeof pattern === "string") return name === pattern; + return pattern instanceof RegExp ? pattern.test(name) : false; +} + +/** + * Remove redundant built-in tools that are superseded by present MCP tools. + * + * @param {Array} tools - Tool definitions (Anthropic or OpenAI shape). + * @returns {{tools: Array, stripped: string[]}} filtered tools + names removed. + */ +function dedupeTools(tools) { + if (!Array.isArray(tools) || tools.length === 0) return { tools, stripped: [] }; + + const names = tools.map(getToolName); + const toStrip = new Set(); + + for (const rule of DEDUP_RULES) { + const hasTrigger = names.some((n) => rule.triggers.some((p) => matches(n, p))); + if (!hasTrigger) continue; + for (const n of names) { + // Never strip a tool that is itself a trigger. 
+ if (rule.triggers.some((p) => matches(n, p))) continue; + if (rule.strip.some((p) => matches(n, p))) toStrip.add(n); + } + } + + if (toStrip.size === 0) return { tools, stripped: [] }; + + const out = tools.filter((t) => !toStrip.has(getToolName(t))); + return { tools: out, stripped: Array.from(toStrip) }; +} + +/** + * Apply tool dedup to a payload in place. No-op when nothing is stripped. + * + * @param {object} payload - Request body with a `tools` array. + * @returns {string[]} names of stripped tools. + */ +function applyToolDedup(payload) { + if (!payload || !Array.isArray(payload.tools)) return []; + const { tools, stripped } = dedupeTools(payload.tools); + if (stripped.length > 0) { + payload.tools = tools; + logger.debug({ stripped }, "[ToolDedup] Stripped redundant built-in tools (MCP equivalents present)"); + } + return stripped; +} + +module.exports = { + dedupeTools, + applyToolDedup, +}; diff --git a/src/context/tool-result-compressor.js b/src/context/tool-result-compressor.js index c538d5b..9171b16 100644 --- a/src/context/tool-result-compressor.js +++ b/src/context/tool-result-compressor.js @@ -455,6 +455,107 @@ function compressContainerOutput(text) { return `${header}\n${dataLines.slice(0, 10).join("\n")}\n... +${dataLines.length - 10} more (${dataLines.length} total)`; } +// 11. Grep / ripgrep output ("file:lineno:content"), per-file match cap. +// Ported from 9router RTK grep filter (rtk/src/cmds/system/pipe_cmd.rs). +const GREP_PER_FILE_MAX = 10; +function compressGrep(text) { + const byFile = new Map(); + let total = 0; + + for (const line of text.split("\n")) { + // splitn(3, ':') — only split on the first two colons. 
+ const first = line.indexOf(":"); + if (first === -1) continue; + const second = line.indexOf(":", first + 1); + if (second === -1) continue; + const file = line.slice(0, first); + const lineNumStr = line.slice(first + 1, second); + const content = line.slice(second + 1); + if (!/^\d+$/.test(lineNumStr)) continue; + total++; + if (!byFile.has(file)) byFile.set(file, []); + byFile.get(file).push([lineNumStr, content]); + } + + // Require a meaningful number of matches so we don't mangle prose that + // happens to contain a "word:123:..." line. + if (total < 5) return null; + + const files = Array.from(byFile.keys()).sort(); + let out = `${total} matches in ${files.length}F:\n\n`; + for (const file of files) { + const matches = byFile.get(file); + out += `[file] ${file} (${matches.length}):\n`; + for (const [lineNum, content] of matches.slice(0, GREP_PER_FILE_MAX)) { + out += ` ${lineNum.padStart(4)}: ${content.trim()}\n`; + } + if (matches.length > GREP_PER_FILE_MAX) { + out += ` +${matches.length - GREP_PER_FILE_MAX}\n`; + } + out += "\n"; + } + return out; +} + +// 12. Generic log de-duplication: collapse consecutive duplicate lines and +// runs of blank lines, with a hard line cap. Ported from 9router RTK dedupLog. +const DEDUP_LINE_MAX = 2000; +function compressDedupLog(text) { + const lines = text.split("\n"); + const out = []; + let prev = null; + let runCount = 0; + let blankStreak = 0; + + const flushRun = () => { + if (prev !== null && runCount > 1) { + out.push(` ... (${runCount - 1} duplicate lines)`); + } + }; + + for (const line of lines) { + if (line.trim() === "") { + if (blankStreak < 1) out.push(line); + blankStreak += 1; + flushRun(); + prev = null; + runCount = 0; + continue; + } + blankStreak = 0; + if (line === prev) { + runCount += 1; + continue; + } + flushRun(); + out.push(line); + prev = line; + runCount = 1; + if (out.length >= DEDUP_LINE_MAX) { + out.push(`... 
(truncated at ${DEDUP_LINE_MAX} lines)`); + return out.join("\n"); + } + } + flushRun(); + return out.join("\n"); +} + +// 13. Last-resort generic truncation: keep head + tail lines, drop the middle. +// Only kicks in for very long output no specific compressor matched. +// Ported from 9router RTK smartTruncate. +const SMART_TRUNCATE_HEAD = 120; +const SMART_TRUNCATE_TAIL = 60; +const SMART_TRUNCATE_MIN_LINES = 250; +function compressSmartTruncate(text) { + const lines = text.split("\n"); + if (lines.length < SMART_TRUNCATE_MIN_LINES) return null; + + const head = lines.slice(0, SMART_TRUNCATE_HEAD); + const tail = lines.slice(lines.length - SMART_TRUNCATE_TAIL); + const cut = lines.length - head.length - tail.length; + return [...head, `... +${cut} lines truncated`, ...tail].join("\n"); +} + // ── Compression Pipeline ───────────────────────────────────────────── const COMPRESSORS = [ @@ -466,8 +567,13 @@ const COMPRESSORS = [ { name: "build_output", fn: compressBuildOutput }, { name: "container_output", fn: compressContainerOutput }, { name: "json_response", fn: compressJSON }, + { name: "grep_output", fn: compressGrep }, { name: "directory_listing", fn: compressDirectoryListing }, { name: "large_file", fn: compressLargeFile }, + // Generic fallbacks last: dedup exact-duplicate spam, then hard head/tail + // truncation only if nothing more specific applied. 
+ { name: "dedup_log", fn: compressDedupLog }, + { name: "smart_truncate", fn: compressSmartTruncate }, ]; // Compression levels tied to routing tiers diff --git a/src/orchestrator/bypass.js b/src/orchestrator/bypass.js new file mode 100644 index 0000000..b47a567 --- /dev/null +++ b/src/orchestrator/bypass.js @@ -0,0 +1,135 @@ +/** + * Request Bypass + * + * Short-circuits Claude Code CLI housekeeping requests that don't need a real + * model call: + * - "Warmup" pings the CLI sends to prime a connection + * - Topic/title extraction (the CLI asks for {"isNewTopic":..,"title":..}) + * - Single-word "count" / "Warmup" probes + * + * Returning a canned response here saves a full provider round-trip (latency + * and tokens) on every session. Inspired by 9router's bypassHandler. + * + * Always on — only ever returns a canned response for unambiguous Claude CLI + * housekeeping traffic, never for real work. + * + * @module orchestrator/bypass + */ + +const logger = require("../logger"); + +/** Flatten Anthropic content (string | block[]) into plain text. */ +function getText(content) { + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .filter((b) => b && b.type === "text" && typeof b.text === "string") + .map((b) => b.text) + .join(" "); + } + return ""; +} + +/** Flatten the top-level Anthropic `system` field (string | block[]). */ +function getSystemText(system) { + if (typeof system === "string") return system; + if (Array.isArray(system)) { + return system + .filter((s) => s && s.type === "text" && typeof s.text === "string") + .map((s) => s.text) + .join(" "); + } + return ""; +} + +/** + * Decide whether a request is a bypassable Claude CLI housekeeping call. + * + * @param {object} args + * @param {object} args.payload - The Anthropic request body. + * @param {object} [args.headers] - Lowercased request headers. + * @returns {{kind: string, text: string}|null} bypass descriptor or null. 
/**
 * Decide whether a request is a bypassable Claude CLI housekeeping call.
 *
 * Checks run in this order and the first match wins:
 *   1. title prefill    - last message is an assistant turn whose text is "{"
 *   2. title extraction - the system prompt mentions `isNewTopic`
 *   3. warmup / count   - exactly one user message reading "Warmup" or "count"
 *
 * Malformed input never throws: a missing argument object, `headers: null`,
 * null entries in `messages`, or a text block without a string `text` simply
 * fail to match and the function returns `null`.
 *
 * @param {object} [args]
 * @param {object} [args.payload] - The Anthropic request body.
 * @param {object} [args.headers] - Lowercased request headers.
 * @returns {{kind: string, text: string}|null} bypass descriptor or null.
 */
function detectBypass({ payload, headers } = {}) {
  const messages = payload?.messages;
  if (!Array.isArray(messages) || messages.length === 0) {
    return null;
  }

  // Only bypass Claude CLI traffic — other clients use these endpoints for
  // real work and must never receive a canned response.
  // `headers?.` (rather than a `= {}` default) also covers an explicit null.
  const ua = String(headers?.["user-agent"] || "").toLowerCase();
  if (!ua.includes("claude-cli")) return null;

  const lastMsg = messages[messages.length - 1];

  // Pattern 1: Title prefill — the CLI seeds an assistant turn with just "{"
  // to coax a JSON object out of the model.
  if (lastMsg?.role === "assistant") {
    const { content } = lastMsg;
    let prefill = "";
    if (Array.isArray(content)) {
      // Only the first block is inspected; a text block whose `text` is not a
      // string is treated as empty instead of crashing on `.trim()`.
      const first = content[0];
      if (first?.type === "text" && typeof first.text === "string") {
        prefill = first.text;
      }
    } else if (typeof content === "string") {
      prefill = content;
    }
    if (prefill.trim() === "{") {
      return { kind: "title_prefill", text: "{}" };
    }
  }

  // Pattern 2: Topic/title extraction — system prompt asks for isNewTopic.
  // Synthesize a title from the first user message instead of calling a model.
  const systemText = getSystemText(payload.system);
  if (systemText.includes("isNewTopic")) {
    const userMsg = messages.find((m) => m?.role === "user");
    const userText = getText(userMsg?.content).trim();
    // Title = first three whitespace-separated words of that message.
    const title = userText.split(/\s+/).filter(Boolean).slice(0, 3).join(" ");
    return {
      kind: "title_extraction",
      text: JSON.stringify({ isNewTopic: true, title }),
    };
  }

  // Pattern 3: Warmup / count probes — a single short user message.
  if (messages.length === 1 && messages[0]?.role === "user") {
    const firstText = getText(messages[0].content).trim();
    if (firstText === "Warmup" || firstText === "count") {
      return { kind: firstText.toLowerCase(), text: "OK" };
    }
  }

  return null;
}

/** Build the processMessage-shaped response for a bypass descriptor. */
+ * Matches the `{ status, body, terminationReason }` contract the router + * consumes (same shape as the prompt-cache early returns). + * + * @param {{kind: string, text: string}} bypass + * @param {string} model - Model id to echo back. + * @returns {{status: number, body: object, terminationReason: string}} + */ +function buildBypassResponse(bypass, model) { + logger.info({ kind: bypass.kind }, "[Bypass] Short-circuiting CLI housekeeping request"); + return { + status: 200, + body: { + id: `msg_bypass_${Date.now()}`, + type: "message", + role: "assistant", + content: [{ type: "text", text: bypass.text }], + model: model || "claude-3-unknown", + stop_reason: "end_turn", + stop_sequence: null, + usage: { input_tokens: 1, output_tokens: 1 }, + lynkr_bypass: { kind: bypass.kind }, + }, + terminationReason: `bypass_${bypass.kind}`, + }; +} + +module.exports = { + detectBypass, + buildBypassResponse, +}; diff --git a/test/token-reduction.test.js b/test/token-reduction.test.js new file mode 100644 index 0000000..01363ef --- /dev/null +++ b/test/token-reduction.test.js @@ -0,0 +1,182 @@ +const assert = require("assert"); +const { describe, it } = require("node:test"); + +const { compressToolResults, getMetrics } = require("../src/context/tool-result-compressor"); +const { detectBypass, buildBypassResponse } = require("../src/orchestrator/bypass"); +const { dedupeTools } = require("../src/context/tool-dedup"); +const { injectCaveman } = require("../src/context/caveman"); + +// Helper: wrap a tool_result string in a message and compress it. 
+function compressOne(text, tier = "SIMPLE") { + const messages = [ + { role: "user", content: [{ type: "tool_result", tool_use_id: "t1", content: text }] }, + ]; + const res = compressToolResults(messages, { tier }); + return { out: messages[0].content[0].content, res }; +} + +describe("RTK filters — grep", () => { + it("groups grep matches by file and caps per-file output", () => { + const lines = []; + for (let i = 1; i <= 30; i++) lines.push(`src/app.js:${i}:const x = ${i};`); + for (let i = 1; i <= 5; i++) lines.push(`src/util.js:${i}:helper(${i});`); + const { out } = compressOne(lines.join("\n")); + assert.ok(out.includes("35 matches in 2F"), `got: ${out.slice(0, 80)}`); + assert.ok(out.includes("[file] src/app.js (30)")); + assert.ok(out.includes("+20"), "should cap at 10 per file and note the rest"); + // tee recovery pointer is appended + assert.ok(/\[full: tee_/.test(out)); + }); + + it("ignores prose that is not grep output", () => { + const text = "This is a normal paragraph.\nNo file:line:content here.\n".repeat(40); + const { out } = compressOne(text); + // grep should not fire; dedup_log collapses the repeated lines instead — but + // the point is the result is still valid text, not a grep summary. 
+ assert.ok(!out.includes("matches in")); + }); +}); + +describe("RTK filters — dedup log", () => { + it("collapses consecutive duplicate lines", () => { + const text = "starting\n" + "retrying connection...\n".repeat(200) + "done\n"; + const { out } = compressOne(text); + assert.ok(out.includes("duplicate lines"), `got: ${out.slice(0, 120)}`); + assert.ok(out.length < text.length * 0.7); + }); +}); + +describe("RTK filters — smart truncate", () => { + it("keeps head and tail of very long unmatched output", () => { + const lines = []; + for (let i = 0; i < 400; i++) lines.push(`unique log line number ${i} ${Math.random()}`); + const { out } = compressOne(lines.join("\n")); + assert.ok(out.includes("lines truncated"), `got tail: ${out.slice(-80)}`); + assert.ok(out.includes("unique log line number 0")); + assert.ok(out.includes("unique log line number 399")); + }); +}); + +describe("request bypass", () => { + const cliHeaders = { "user-agent": "claude-cli/1.0.0" }; + + it("bypasses Warmup pings from the Claude CLI", () => { + const b = detectBypass({ + payload: { messages: [{ role: "user", content: "Warmup" }] }, + headers: cliHeaders, + }); + assert.ok(b, "expected bypass"); + assert.strictEqual(b.kind, "warmup"); + }); + + it("synthesizes a title for topic-extraction requests", () => { + const b = detectBypass({ + payload: { + system: "Analyze if this is a new topic. 
Respond with isNewTopic and title.", + messages: [{ role: "user", content: "refactor the auth middleware please" }], + }, + headers: cliHeaders, + }); + assert.ok(b); + assert.strictEqual(b.kind, "title_extraction"); + const parsed = JSON.parse(b.text); + assert.strictEqual(parsed.isNewTopic, true); + assert.strictEqual(parsed.title, "refactor the auth"); + }); + + it("handles the '{' title-prefill pattern", () => { + const b = detectBypass({ + payload: { + messages: [ + { role: "user", content: "hi" }, + { role: "assistant", content: [{ type: "text", text: "{" }] }, + ], + }, + headers: cliHeaders, + }); + assert.ok(b); + assert.strictEqual(b.kind, "title_prefill"); + }); + + it("does NOT bypass non-CLI clients", () => { + const b = detectBypass({ + payload: { messages: [{ role: "user", content: "Warmup" }] }, + headers: { "user-agent": "cursor/0.4" }, + }); + assert.strictEqual(b, null); + }); + + it("does NOT bypass a real coding question from the CLI", () => { + const b = detectBypass({ + payload: { messages: [{ role: "user", content: "write a binary search in python" }] }, + headers: cliHeaders, + }); + assert.strictEqual(b, null); + }); + + it("builds a valid Anthropic message response", () => { + const r = buildBypassResponse({ kind: "warmup", text: "OK" }, "claude-x"); + assert.strictEqual(r.status, 200); + assert.strictEqual(r.body.type, "message"); + assert.strictEqual(r.body.content[0].text, "OK"); + assert.strictEqual(r.body.model, "claude-x"); + assert.strictEqual(r.terminationReason, "bypass_warmup"); + }); +}); + +describe("MCP-aware tool dedup", () => { + it("strips built-in web tools when Exa MCP is present", () => { + const tools = [ + { name: "mcp__exa__web_search_exa" }, + { name: "WebSearch" }, + { name: "WebFetch" }, + { name: "Read" }, + ]; + const { tools: out, stripped } = dedupeTools(tools); + assert.deepStrictEqual(stripped.sort(), ["WebFetch", "WebSearch"]); + assert.ok(out.some((t) => t.name === "mcp__exa__web_search_exa")); + 
assert.ok(out.some((t) => t.name === "Read")); + assert.ok(!out.some((t) => t.name === "WebSearch")); + }); + + it("is a no-op when no trigger MCP tool is present", () => { + const tools = [{ name: "WebSearch" }, { name: "Read" }]; + const { tools: out, stripped } = dedupeTools(tools); + assert.deepStrictEqual(stripped, []); + assert.strictEqual(out.length, 2); + }); + + it("supports OpenAI-shaped tool definitions", () => { + const tools = [ + { type: "function", function: { name: "mcp__tavily__tavily_search" } }, + { type: "function", function: { name: "WebFetch" } }, + ]; + const { stripped } = dedupeTools(tools); + assert.deepStrictEqual(stripped, ["WebFetch"]); + }); +}); + +describe("caveman injector", () => { + it("is a no-op when disabled", () => { + const sys = "You are a helpful assistant."; + assert.strictEqual(injectCaveman(sys, { enabled: false }), sys); + }); + + it("appends a brevity instruction when enabled", () => { + const out = injectCaveman("base prompt", { enabled: true, level: "lite" }); + assert.ok(out.startsWith("base prompt")); + assert.ok(out.includes("[brevity]")); + assert.ok(out.includes("terse")); + }); + + it("is idempotent (no double injection)", () => { + const once = injectCaveman("base", { enabled: true }); + const twice = injectCaveman(once, { enabled: true }); + assert.strictEqual(once, twice); + }); + + it("falls back to lite for an unknown level", () => { + const out = injectCaveman("", { enabled: true, level: "bogus" }); + assert.ok(out.includes("[brevity]")); + }); +}); From 9cf8f64d394817e1f15e35f6ade2a0d871a9b020 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:40:36 -0700 Subject: [PATCH 02/10] =?UTF-8?q?fix(routing):=20de-escalate=20risk=20fals?= =?UTF-8?q?e-positives=20and=20add=20session=E2=86=92provider=20affinity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - risk-analyzer: remove the over-broad 'session' and 'token' substring keywords that 
force-escalated ordinary requests to COMPLEX (they matched benign paths like src/sessions/* and tokenizer.js). Real secrets stay covered by secret/credential/api-key. Genuine auth/security paths still flag high. - session-affinity: new module + wiring in determineProviderSmart. When a conversation already carries tool_use/tool_result history, reuse the provider it first routed to, so tool_call_id linkage doesn't break across providers (the Azure/Moonshot 400 "tool_call_id not found" errors). Fresh turns still route per-tier. Adds test/session-affinity.test.js. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/routing/index.js | 39 ++++++++++++++ src/routing/risk-analyzer.js | 9 +++- src/routing/session-affinity.js | 96 +++++++++++++++++++++++++++++++++ test/session-affinity.test.js | 64 ++++++++++++++++++++++ 4 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 src/routing/session-affinity.js create mode 100644 test/session-affinity.test.js diff --git a/src/routing/index.js b/src/routing/index.js index 93c270b..b760fc3 100644 --- a/src/routing/index.js +++ b/src/routing/index.js @@ -138,7 +138,46 @@ function getBestLocalProvider() { * @param {Object} options - Routing options * @returns {Object} Routing decision with provider and metadata */ +const sessionAffinity = require('./session-affinity'); + +/** + * Provider routing with session affinity. + * + * When a conversation already carries tool history, reuse the provider the + * session first routed to so tool-call IDs don't break across providers. + * Fresh turns route normally and refresh the session's pinned provider. + */ async function determineProviderSmart(payload, options = {}) { + const sessionId = payload?._sessionId || null; + + // Enforce affinity only for in-flight tool exchanges — the turns that 400 + // if the provider changes. Fresh turns keep full per-turn tier routing. 
+ if (sessionId && !options.forceProvider && sessionAffinity.payloadHasToolHistory(payload)) { + const pinned = sessionAffinity.getPinned(sessionId); + if (pinned) { + logger.debug({ sessionId, provider: pinned.provider, tier: pinned.tier }, + '[Routing] Session affinity — reusing provider for tool-bearing turn'); + return { + provider: pinned.provider, + model: pinned.model, + tier: pinned.tier, + method: 'session_affinity', + reason: 'tool_history_provider_pin', + }; + } + } + + const decision = await _determineProviderSmartInner(payload, options); + + // Remember the chosen provider so later tool-bearing turns stay consistent. + if (sessionId && decision?.provider && !options.forceProvider) { + sessionAffinity.setPinned(sessionId, decision); + } + + return decision; +} + +async function _determineProviderSmartInner(payload, options = {}) { const primaryProvider = config.modelProvider?.type ?? 'databricks'; // Risk analysis runs orthogonally to complexity. We compute it once diff --git a/src/routing/risk-analyzer.js b/src/routing/risk-analyzer.js index efd8281..78c402c 100644 --- a/src/routing/risk-analyzer.js +++ b/src/routing/risk-analyzer.js @@ -13,13 +13,18 @@ const { extractContent } = require('./complexity-analyzer'); // Substring keywords found in file paths or instruction text. // Matched case-insensitively as raw substrings, so "auth" hits // "src/auth/login.ts" and "authentication". +// NOTE: keywords are matched as case-insensitive *substrings* against file +// paths, so overly generic terms cause false positives. 'session' and 'token' +// were removed because they match benign paths (src/sessions/*, tokenizer.js, +// token-budget.js) and were force-escalating ordinary requests to COMPLEX — +// real secrets/credentials are still covered by the keywords below. 
const PROTECTED_PATH_KEYWORDS = [ - 'auth', 'oauth', 'jwt', 'session', 'security', 'permission', 'rbac', + 'auth', 'oauth', 'jwt', 'security', 'permission', 'rbac', 'payment', 'payments', 'billing', 'invoice', 'subscription', 'migration', 'migrations', 'schema', 'infra', 'terraform', 'kustomize', 'helm', 'kubernetes', '.github/workflows', '.env', 'secret', 'credential', - 'api-key', 'api_key', 'apikey', 'token', + 'api-key', 'api_key', 'apikey', 'webhook', 'admin', ]; diff --git a/src/routing/session-affinity.js b/src/routing/session-affinity.js new file mode 100644 index 0000000..5f76f82 --- /dev/null +++ b/src/routing/session-affinity.js @@ -0,0 +1,96 @@ +/** + * Session → Provider Affinity + * + * A multi-turn agentic conversation builds up tool_use / tool_result history + * whose tool-call IDs are formatted for the provider that produced them. If a + * later turn re-routes to a *different* provider (because per-turn complexity + * or risk changed), that provider rejects the orphaned tool linkage: + * + * Azure: 400 "No tool call found for function call output with call_id …" + * Moonshot: 400 "Invalid request: tool_call_id is not found" + * + * To prevent that, once a session has chosen a provider we keep subsequent + * turns on it *while the payload carries tool history*. Fresh turns (no tool + * state) still route normally, so per-turn tier routing is preserved. + * + * @module routing/session-affinity + */ + +const MAX_ENTRIES = 2000; +const TTL_MS = 60 * 60 * 1000; // 1 hour + +/** @type {Map} */ +const pins = new Map(); + +function _evictIfNeeded() { + if (pins.size <= MAX_ENTRIES) return; + // Map preserves insertion order — drop the oldest. + const oldest = pins.keys().next().value; + if (oldest !== undefined) pins.delete(oldest); +} + +/** + * True when the payload contains an in-flight tool exchange — i.e. a prior + * assistant tool_use or a user tool_result. These are the turns whose + * tool-call IDs break if the provider changes. 
+ * @param {object} payload + * @returns {boolean} + */ +function payloadHasToolHistory(payload) { + const messages = payload?.messages; + if (!Array.isArray(messages)) return false; + for (const msg of messages) { + const content = msg?.content; + if (!Array.isArray(content)) continue; + for (const block of content) { + const t = block?.type; + if (t === "tool_use" || t === "tool_result") return true; + } + } + return false; +} + +/** + * Return the pinned routing decision for a session, or null if none / expired. + * @param {string} sessionId + */ +function getPinned(sessionId) { + if (!sessionId) return null; + const entry = pins.get(sessionId); + if (!entry) return null; + if (Date.now() - entry.ts > TTL_MS) { + pins.delete(sessionId); + return null; + } + return entry; +} + +/** + * Record the provider a session routed to, for reuse on later tool-bearing turns. + * @param {string} sessionId + * @param {{provider:string, model?:string|null, tier?:string|null}} decision + */ +function setPinned(sessionId, decision) { + if (!sessionId || !decision?.provider) return; + // Refresh insertion order so active sessions aren't evicted. + pins.delete(sessionId); + pins.set(sessionId, { + provider: decision.provider, + model: decision.model ?? null, + tier: decision.tier ?? null, + ts: Date.now(), + }); + _evictIfNeeded(); +} + +/** Test/maintenance helper. 
*/ +function _clear() { + pins.clear(); +} + +module.exports = { + payloadHasToolHistory, + getPinned, + setPinned, + _clear, +}; diff --git a/test/session-affinity.test.js b/test/session-affinity.test.js new file mode 100644 index 0000000..8533d99 --- /dev/null +++ b/test/session-affinity.test.js @@ -0,0 +1,64 @@ +const assert = require("assert"); +const { describe, it, beforeEach } = require("node:test"); + +const affinity = require("../src/routing/session-affinity"); + +describe("session-affinity: payloadHasToolHistory", () => { + it("is false for a plain text conversation", () => { + const payload = { messages: [{ role: "user", content: "explain this repo" }] }; + assert.strictEqual(affinity.payloadHasToolHistory(payload), false); + }); + + it("is true when an assistant tool_use is present", () => { + const payload = { + messages: [ + { role: "user", content: "read the file" }, + { role: "assistant", content: [{ type: "tool_use", id: "t1", name: "Read", input: {} }] }, + ], + }; + assert.strictEqual(affinity.payloadHasToolHistory(payload), true); + }); + + it("is true when a user tool_result is present", () => { + const payload = { + messages: [ + { role: "user", content: [{ type: "tool_result", tool_use_id: "t1", content: "ok" }] }, + ], + }; + assert.strictEqual(affinity.payloadHasToolHistory(payload), true); + }); + + it("handles missing/!array messages safely", () => { + assert.strictEqual(affinity.payloadHasToolHistory({}), false); + assert.strictEqual(affinity.payloadHasToolHistory(null), false); + assert.strictEqual(affinity.payloadHasToolHistory({ messages: "x" }), false); + }); +}); + +describe("session-affinity: pin lifecycle", () => { + beforeEach(() => affinity._clear()); + + it("returns null when nothing is pinned", () => { + assert.strictEqual(affinity.getPinned("s1"), null); + }); + + it("round-trips a pinned decision", () => { + affinity.setPinned("s1", { provider: "moonshot", model: "moonshot-v1-auto", tier: "COMPLEX" }); + const got = 
affinity.getPinned("s1"); + assert.strictEqual(got.provider, "moonshot"); + assert.strictEqual(got.model, "moonshot-v1-auto"); + assert.strictEqual(got.tier, "COMPLEX"); + }); + + it("ignores empty session id or provider", () => { + affinity.setPinned("", { provider: "ollama" }); + affinity.setPinned("s2", { provider: undefined }); + assert.strictEqual(affinity.getPinned("s2"), null); + }); + + it("keeps the latest provider for a session", () => { + affinity.setPinned("s1", { provider: "ollama" }); + affinity.setPinned("s1", { provider: "azure-openai" }); + assert.strictEqual(affinity.getPinned("s1").provider, "azure-openai"); + }); +}); From 7dabc47f99b2db7e6614a4e68c49ebdf51f8b0ca Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:40:44 -0700 Subject: [PATCH 03/10] fix(orchestrator): pass client tools through, wire bypass/dedup/affinity - Gate smart tool selection to server mode only. In client/passthrough mode the client (Claude Code) owns tool execution, so stripping its tools made the model emit calls for removed tools that were then dropped as "hallucinated", stalling the session. Tools now pass through intact. - Wire request bypass (early), MCP tool dedup, and caveman injection into processMessage / runAgentLoop. - Thread session id (_sessionId) for provider affinity. - config: add caveman flags (CAVEMAN_ENABLED/LEVEL). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/config/index.js | 9 +++++++++ src/orchestrator/index.js | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/config/index.js b/src/config/index.js index 729f2fc..e4ac410 100644 --- a/src/config/index.js +++ b/src/config/index.js @@ -208,6 +208,11 @@ const tokenBudgetWarning = Number.parseInt(process.env.TOKEN_BUDGET_WARNING ?? " const tokenBudgetMax = Number.parseInt(process.env.TOKEN_BUDGET_MAX ?? 
"180000", 10); const tokenBudgetEnforcement = process.env.TOKEN_BUDGET_ENFORCEMENT !== "false"; // default true +// Caveman terse-output injection (opt-in, off by default) +const cavemanEnabled = process.env.CAVEMAN_ENABLED === "true"; +const cavemanLevel = (process.env.CAVEMAN_LEVEL ?? "lite").toLowerCase(); + + // TOON payload compression (opt-in) const toonEnabled = process.env.TOON_ENABLED === "true"; // default false const toonMinBytes = Number.parseInt(process.env.TOON_MIN_BYTES ?? "4096", 10); @@ -641,6 +646,10 @@ var config = { toolResultCompression: { enabled: true, }, + caveman: { + enabled: cavemanEnabled, + level: cavemanLevel, + }, server: { jsonLimit: process.env.REQUEST_JSON_LIMIT ?? "1gb", }, diff --git a/src/orchestrator/index.js b/src/orchestrator/index.js index f1144b6..87d2cce 100644 --- a/src/orchestrator/index.js +++ b/src/orchestrator/index.js @@ -18,6 +18,7 @@ const { createAuditLogger } = require("../logger/audit-logger"); const { getResolvedIp, runWithDnsContext } = require("../clients/dns-logger"); const { getShuttingDown } = require("../api/health"); const { tryPreflight, buildSatisfiedResponse: buildPreflightResponse } = require("./preflight"); +const { detectBypass, buildBypassResponse } = require("./bypass"); const crypto = require("crypto"); const { asyncClone, asyncTransform, getPoolStats } = require("../workers/helpers"); const { getSemanticCache, isSemanticCacheEnabled } = require("../cache/semantic"); @@ -1362,8 +1363,12 @@ function sanitizePayload(payload) { delete clean.tool_choice; } - // Smart tool selection (universal, applies to all providers) - if (config.smartToolSelection?.enabled && Array.isArray(clean.tools) && clean.tools.length > 0) { + // Smart tool selection (server mode only). In client/passthrough mode the + // client (e.g. 
Claude Code) owns tool execution, so stripping its tools would + // make the model emit calls for tools we removed — they then get dropped as + // "hallucinated" and the session makes no progress. Pass tools through intact. + const inClientMode = config.toolExecutionMode === "client" || config.toolExecutionMode === "passthrough"; + if (!inClientMode && config.smartToolSelection?.enabled && Array.isArray(clean.tools) && clean.tools.length > 0) { const classification = classifyRequestType(clean); const selectedTools = selectToolsSmartly(clean.tools, classification, { provider: providerType, @@ -1977,6 +1982,12 @@ IMPORTANT TOOL USAGE RULES: cleanPayload._tenantPolicy = options.tenantPolicy; } + // Thread session id for provider affinity — keeps a tool-bearing + // conversation on one provider so tool_call_id linkage doesn't break. + if (session?.id) { + cleanPayload._sessionId = session.id; + } + // RTK-inspired tool result compression: compress large tool_results // before they reach the model (saves 60-90% on test/git/lint output) if (config.toolResultCompression?.enabled !== false) { @@ -1985,6 +1996,18 @@ IMPORTANT TOOL USAGE RULES: compressToolResults(cleanPayload.messages, { tier }); } + // MCP-aware tool dedup: drop built-in tools superseded by present MCP tools + // (e.g. WebSearch/WebFetch when Exa/Tavily MCP is available). Always on. + const { applyToolDedup } = require("../context/tool-dedup"); + applyToolDedup(cleanPayload); + + // Caveman terse-output injection (opt-in): nudge the model toward shorter + // responses to reduce output tokens. 
+ if (config.caveman?.enabled === true) { + const { injectCaveman } = require("../context/caveman"); + cleanPayload.system = injectCaveman(cleanPayload.system); + } + if (agentTimer) agentTimer.mark("preInvokeModel"); let databricksResponse; try { @@ -3735,6 +3758,14 @@ async function processMessage({ payload, headers, session, cwd, options = {} }) }; } + // === REQUEST BYPASS === + // Claude CLI housekeeping (Warmup pings, topic/title extraction) doesn't + // need a model call — return a canned response and skip the provider. + const bypass = detectBypass({ payload, headers }); + if (bypass) { + return buildBypassResponse(bypass, requestedModel); + } + // === PREFLIGHT CHECK === // If the request supplied preflight_commands and they all pass in // the workspace, the work is already done — short-circuit with a From 735eb4681bb51f2e3c47f78ed2a3d7940dc84849 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:40:53 -0700 Subject: [PATCH 04/10] fix(providers,telemetry): Moonshot k2 params, cost capture, native ABI guard Providers: - Moonshot: remap deprecated moonshot-v1-auto (400 "tokenization failed") to a fixed model; pin temperature=1 and top_p=0.95 for kimi-k2.x thinking models (they reject other values). - openrouter-utils: replace empty-string message content with a space so Kimi/OpenAI-compatible providers don't 400 on "tokenization failed". Telemetry / cost: - telemetry: add request_text + response_text columns (+ migration for existing DBs); capture prompt/response text (truncated) and per-request cost_usd (model-registry pricing x tokens) on success + fallback paths. Builds the training corpus for the routing ML models. Native modules: - scripts/check-native.js + postinstall: detect a better-sqlite3 ABI mismatch (Node upgrade) and rebuild native optional deps. Best-effort, never fails install. Fixes telemetry/sessions/memory silently going dark after a Node bump. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- package.json | 4 +- scripts/check-native.js | 97 +++++++++++++++++++++++++++++++++ src/clients/databricks.js | 72 +++++++++++++++++++++++- src/clients/openrouter-utils.js | 15 +++++ src/routing/telemetry.js | 19 ++++++- 5 files changed, 200 insertions(+), 7 deletions(-) create mode 100644 scripts/check-native.js diff --git a/package.json b/package.json index e5cb491..ea00306 100644 --- a/package.json +++ b/package.json @@ -8,13 +8,15 @@ "lynkr-setup": "scripts/setup.js" }, "scripts": { + "postinstall": "node scripts/check-native.js", + "rebuild-native": "node scripts/check-native.js", "prestart": "node -e \"if(process.env.HEADROOM_ENABLED==='true'&&process.env.HEADROOM_DOCKER_ENABLED!=='false'){process.exit(0)}else{process.exit(1)}\" && docker compose --profile headroom up -d --build headroom 2>/dev/null || echo 'Headroom skipped (disabled or Docker not running)'", "start": "node index.js 2>&1 | npx pino-pretty --sync", "stop": "node -e \"if(process.env.HEADROOM_ENABLED==='true'&&process.env.HEADROOM_DOCKER_ENABLED!=='false'){process.exit(0)}else{process.exit(1)}\" && docker compose --profile headroom down || echo 'Headroom skipped (disabled or Docker not running)'", "dev": "nodemon index.js", "lint": "eslint src index.js", "test": "npm run test:unit && npm run test:performance", - "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/resilience.test.js 
test/telemetry-routing.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js test/distill.test.js test/large-payload.test.js test/code-mode.test.js test/prompt-cache-injection.test.js test/risk-analyzer.test.js test/interaction-block.test.js test/preflight.test.js", + "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/resilience.test.js test/telemetry-routing.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js test/distill.test.js test/large-payload.test.js test/code-mode.test.js test/prompt-cache-injection.test.js test/risk-analyzer.test.js test/interaction-block.test.js test/preflight.test.js test/token-reduction.test.js test/session-affinity.test.js", "test:memory": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js", "test:new-features": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js", "test:performance": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/hybrid-routing-performance.test.js && 
DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/performance-tests.js", diff --git a/scripts/check-native.js b/scripts/check-native.js new file mode 100644 index 0000000..eba34ca --- /dev/null +++ b/scripts/check-native.js @@ -0,0 +1,97 @@ +#!/usr/bin/env node +/** + * Native module ABI guard (postinstall). + * + * better-sqlite3 (and the other native optionalDependencies) are compiled + * against a specific Node ABI. When Node is upgraded, the prebuilt/compiled + * binary stops loading with: + * + * "was compiled against a different Node.js version using + * NODE_MODULE_VERSION 115. This version of Node.js requires + * NODE_MODULE_VERSION 141." + * + * The failure is silent at runtime — telemetry, request logs, and the memory + * store all sit behind try/catch and simply go empty. This probe detects the + * mismatch and rebuilds the native modules so it self-heals on `npm install`. + * + * It is intentionally best-effort: it NEVER exits non-zero, so it can't break + * `npm install` on machines without a build toolchain (the modules are + * optional and the app degrades gracefully without them). + */ + +const { execSync } = require("child_process"); + +// Native optionalDependencies that are ABI-sensitive. If Node changed, all of +// them are stale, so we rebuild the set in one pass. +const NATIVE_DEPS = [ + "better-sqlite3", + "hnswlib-node", + "tree-sitter", + "tree-sitter-javascript", + "tree-sitter-python", + "tree-sitter-typescript", +]; + +function log(msg) { + console.log(`[check-native] ${msg}`); +} + +/** + * Probe better-sqlite3 — the canary. `require()` alone is not enough: the + * native addon only loads when a Database is instantiated. 
+ * @returns {"ok"|"absent"|"mismatch"} + */ +function probe() { + let Database; + try { + Database = require("better-sqlite3"); + } catch (err) { + if (err && err.code === "MODULE_NOT_FOUND") return "absent"; + return "mismatch"; + } + try { + const db = new Database(":memory:"); + db.close(); + return "ok"; + } catch (err) { + if (/NODE_MODULE_VERSION|different Node\.js version|invalid ELF|dlopen|\.node/i.test(err.message || "")) { + return "mismatch"; + } + // Some other instantiation error — not an ABI issue we can fix by rebuild. + return "ok"; + } +} + +function main() { + const status = probe(); + + if (status === "absent") { + // Optional dependency not installed (e.g. build skipped). Nothing to do. + return; + } + if (status === "ok") { + return; + } + + log("native module ABI mismatch detected (Node was likely upgraded). Rebuilding native modules…"); + try { + execSync(`npm rebuild ${NATIVE_DEPS.join(" ")}`, { stdio: "inherit" }); + } catch { + log("rebuild did not complete (a build toolchain may be missing). Continuing — native features will be disabled until you run: npm rebuild better-sqlite3"); + return; + } + + // Re-probe to report the outcome. + if (probe() === "ok") { + log("native modules rebuilt successfully."); + } else { + log("native modules still not loadable after rebuild. Run `npm rebuild better-sqlite3` manually."); + } +} + +try { + main(); +} catch (err) { + // Never fail the install. + log(`skipped (${err.message})`); +} diff --git a/src/clients/databricks.js b/src/clients/databricks.js index ef9e244..f72302d 100644 --- a/src/clients/databricks.js +++ b/src/clients/databricks.js @@ -1506,10 +1506,16 @@ async function invokeMoonshot(body) { "claude-haiku-4-5-20251001": "kimi-k2-turbo-preview", "claude-haiku-4-5": "kimi-k2-turbo-preview", "claude-3-haiku": "kimi-k2-turbo-preview", + // moonshot-v1-auto 400s with "tokenization failed" (its server-side auto + // context-size pass fails on large tool-bearing payloads). 
Remap to a + // fixed model that's broadly available on api.moonshot.ai. + "moonshot-v1-auto": "moonshot-v1-128k", }; const requestedModel = body._tierModel || body.model || config.moonshot.model; - const mappedModel = modelMap[requestedModel] || config.moonshot.model || "kimi-k2-turbo-preview"; + let mappedModel = modelMap[requestedModel] || config.moonshot.model || "kimi-k2-turbo-preview"; + // Guard against the deprecated auto model arriving via config too. + if (mappedModel === "moonshot-v1-auto") mappedModel = "moonshot-v1-128k"; // Convert messages using existing utility const messages = convertAnthropicMessagesToOpenRouter(body.messages || []); @@ -1522,12 +1528,18 @@ async function invokeMoonshot(body) { messages.unshift({ role: "system", content: systemContent }); } + // kimi-k2.x (k2.5 / k2.6 …) are thinking models that only accept + // temperature: 1 — any other value 400s with "invalid temperature". + const isKimiThinking = /^kimi-k2/i.test(mappedModel); + const moonshotBody = { model: mappedModel, messages, max_tokens: body.max_tokens || 16384, - temperature: body.temperature ?? 0.7, - top_p: body.top_p ?? 1.0, + // kimi-k2.x thinking models pin sampling params: temperature must be 1 + // and top_p must be 0.95 — any other value 400s. + temperature: isKimiThinking ? 1 : (body.temperature ?? 0.7), + top_p: isKimiThinking ? 0.95 : (body.top_p ?? 1.0), stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented }; @@ -2027,6 +2039,54 @@ async function invokeCodex(body) { }; } +/** + * Compute request cost in USD from model pricing × token usage. + * Registry returns per-1M-token prices ({ input, output }); returns null when + * pricing is unknown so we don't record misleading zeros. 
+ */ +function computeCostUsd(model, inputTokens, outputTokens) { + try { + const { getModelRegistrySync } = require("../routing/model-registry"); + const reg = getModelRegistrySync && getModelRegistrySync(); + const cost = reg?.getCost?.(model); + if (!cost || (cost.input == null && cost.output == null)) return null; + const inUsd = ((inputTokens || 0) / 1e6) * (cost.input || 0); + const outUsd = ((outputTokens || 0) / 1e6) * (cost.output || 0); + return Number((inUsd + outUsd).toFixed(6)); + } catch { + return null; + } +} + +// Telemetry prompt/response text is always captured (truncated) to build the +// routing ML training corpus. Stored locally in .lynkr/telemetry.db only. +const TELEMETRY_TEXT_MAXLEN = 2000; + +/** Flatten the latest user message to plain text (for telemetry capture). */ +function captureRequestText(body) { + const messages = body?.messages; + if (!Array.isArray(messages)) return null; + for (let i = messages.length - 1; i >= 0; i--) { + const m = messages[i]; + if (m?.role !== "user") continue; + let text = ""; + if (typeof m.content === "string") text = m.content; + else if (Array.isArray(m.content)) { + text = m.content.filter((b) => b?.type === "text").map((b) => b.text || "").join(" "); + } + if (text) return text.slice(0, TELEMETRY_TEXT_MAXLEN); + } + return null; +} + +/** Flatten an Anthropic response's text blocks to plain text (for telemetry). */ +function captureResponseText(resultJson) { + const content = resultJson?.content; + if (!Array.isArray(content)) return null; + const text = content.filter((b) => b?.type === "text").map((b) => b.text || "").join(" "); + return text ? 
text.slice(0, TELEMETRY_TEXT_MAXLEN) : null; +} + async function invokeModel(body, options = {}) { const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing"); const metricsCollector = getMetricsCollector(); @@ -2233,6 +2293,9 @@ async function invokeModel(body, options = {}) { circuit_breaker_state: breaker.state, quality_score: qualityScore, tokens_per_second: outputTokens && latency > 0 ? outputTokens / (latency / 1000) : null, + cost_usd: computeCostUsd(routingDecision.model || body._tierModel, inputTokens, outputTokens), + request_text: captureRequestText(body), + response_text: captureResponseText(result.json), }); // Return result with provider info and routing decision for headers @@ -2394,6 +2457,9 @@ async function invokeModel(body, options = {}) { { status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime } ), tokens_per_second: fbOutputTokens && fallbackLatency > 0 ? fbOutputTokens / (fallbackLatency / 1000) : null, + cost_usd: computeCostUsd(routingDecision.model || body._tierModel, fbInputTokens, fbOutputTokens), + request_text: captureRequestText(body), + response_text: captureResponseText(fallbackResult.json), }); // Return result with actual provider used (fallback provider) and routing decision diff --git a/src/clients/openrouter-utils.js b/src/clients/openrouter-utils.js index 1a2daba..7978f8c 100644 --- a/src/clients/openrouter-utils.js +++ b/src/clients/openrouter-utils.js @@ -176,6 +176,21 @@ function convertAnthropicMessagesToOpenRouter(anthropicMessages) { } } + // Kimi/Moonshot (and some OpenAI-compatible APIs) reject a message whose + // content is an empty string with "Invalid request: tokenization failed". + // This happens when a turn had only non-text blocks (thinking / image / + // stripped content) and flattened to "". 
Replace empty/whitespace-only + // content with a single space — but never touch an assistant message that + // carries tool_calls, where content: null is intentional and required. + for (const m of converted) { + if (m.role === 'tool') continue; + const hasToolCalls = Array.isArray(m.tool_calls) && m.tool_calls.length > 0; + if (hasToolCalls) continue; + if (typeof m.content !== 'string' || m.content.trim() === '') { + m.content = ' '; + } + } + // Log the converted messages for debugging logger.debug({ inputCount: anthropicMessages.length, diff --git a/src/routing/telemetry.js b/src/routing/telemetry.js index 5d2a504..e606d35 100644 --- a/src/routing/telemetry.js +++ b/src/routing/telemetry.js @@ -94,7 +94,9 @@ function init() { circuit_breaker_state TEXT, quality_score REAL, tokens_per_second REAL, - cost_efficiency REAL + cost_efficiency REAL, + request_text TEXT, + response_text TEXT ); CREATE INDEX IF NOT EXISTS idx_telemetry_provider @@ -110,6 +112,15 @@ function init() { ON routing_telemetry(session_id, timestamp); `); + // Migration: add columns to pre-existing tables (CREATE TABLE IF NOT EXISTS + // won't add them to a DB created before these columns existed). 
+ const existingCols = new Set(db.prepare("PRAGMA table_info(routing_telemetry)").all().map((c) => c.name)); + for (const col of ["request_text", "response_text"]) { + if (!existingCols.has(col)) { + db.exec(`ALTER TABLE routing_telemetry ADD COLUMN ${col} TEXT`); + } + } + logger.info({ dbPath }, "Routing telemetry database initialised"); return true; } catch (err) { @@ -163,14 +174,14 @@ function record(data) { provider, model, routing_method, was_fallback, output_tokens, latency_ms, status_code, error_type, cost_usd, tool_calls_made, retry_count, circuit_breaker_state, quality_score, tokens_per_second, - cost_efficiency + cost_efficiency, request_text, response_text ) VALUES ( @request_id, @session_id, @timestamp, @complexity_score, @tier, @agentic_type, @tool_count, @input_tokens, @message_count, @request_type, @provider, @model, @routing_method, @was_fallback, @output_tokens, @latency_ms, @status_code, @error_type, @cost_usd, @tool_calls_made, @retry_count, @circuit_breaker_state, @quality_score, @tokens_per_second, - @cost_efficiency + @cost_efficiency, @request_text, @response_text )` ); if (!insert) return; @@ -201,6 +212,8 @@ function record(data) { quality_score: data.quality_score ?? null, tokens_per_second: data.tokens_per_second ?? null, cost_efficiency: data.cost_efficiency ?? null, + request_text: data.request_text ?? null, + response_text: data.response_text ?? null, }); } catch (err) { logger.debug({ err: err.message }, "Telemetry record failed"); From dc948c9315a3d3f63846d2aa5521296ad6c01a78 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:41:01 -0700 Subject: [PATCH 05/10] feat(dashboard): tier-aware Configured Providers panel Derive the Configured Providers list from TIER_* + MODEL_PROVIDER intersected with configured credentials, instead of listing every provider that merely has an endpoint set (which always showed llamacpp/lmstudio via their default endpoints). 
Show the tiers each provider serves, and flag tiers pointing at a provider with no credentials. Also documents the caveman env flags in .env.example. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 6 +++ public/dashboard.html | 14 ++++++- src/dashboard/api.js | 87 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 88 insertions(+), 19 deletions(-) diff --git a/.env.example b/.env.example index f96b135..b98ca72 100644 --- a/.env.example +++ b/.env.example @@ -445,6 +445,12 @@ TOON_MIN_BYTES=4096 TOON_FAIL_OPEN=true TOON_LOG_STATS=true +# Caveman terse-output injection (opt-in): append a brevity instruction to the +# system prompt to reduce OUTPUT tokens. Off by default — changes model style. +# Levels: lite | full | ultra +CAVEMAN_ENABLED=false +CAVEMAN_LEVEL=lite + # ============================================================================== # Tiered Model Routing (REQUIRED) # ============================================================================== diff --git a/public/dashboard.html b/public/dashboard.html index 83bef39..5cd6ea8 100644 --- a/public/dashboard.html +++ b/public/dashboard.html @@ -244,6 +244,7 @@ const t = d.today; const s = d.stats; + const tierLabel = t => t === 'default' ? 'default' : String(t).toLowerCase(); const providerCards = d.providers.length === 0 ? `

No providers configured

` : d.providers.map(p => ` @@ -251,10 +252,21 @@
${p.name} + ${(p.tiers || []).map(t => `${tierLabel(t)}`).join('')}
${p.type} `).join(''); + const providerWarnings = (d.providerWarnings || []).map(w => ` +
+
+ + ${w.name} + ${(w.tiers || []).map(t => `${tierLabel(t)}`).join('')} +
+ no credentials +
`).join(''); + const recentRows = (d.recentRequests || []).map(r => ` ${fmt.ago(r.timestamp)} @@ -279,7 +291,7 @@ ${card(`

Configured Providers

-
${providerCards}
+
${providerCards}${providerWarnings}
`)} diff --git a/src/dashboard/api.js b/src/dashboard/api.js index 5e0399c..58c4373 100644 --- a/src/dashboard/api.js +++ b/src/dashboard/api.js @@ -5,24 +5,74 @@ const metrics = require('../metrics'); const { getMetricsCollector } = require('../observability/metrics'); const { TIER_DEFINITIONS } = require('../routing/model-tiers'); -function getConfiguredProviders() { +// Per-provider type + whether its credentials/endpoint are actually present. +function providerMeta() { const c = config; - const providers = []; - const add = (name, type, ok) => ok && providers.push({ name, type }); - - add('databricks', 'cloud', c.databricks?.url && c.databricks?.apiKey); - add('azure-anthropic','cloud', c.azureAnthropic?.endpoint && c.azureAnthropic?.apiKey); - add('bedrock', 'cloud', c.bedrock?.apiKey); - add('openrouter', 'cloud', c.openrouter?.apiKey); - add('openai', 'cloud', c.openai?.apiKey); - add('azure-openai', 'cloud', c.azureOpenAI?.endpoint && c.azureOpenAI?.apiKey); - add('vertex', 'cloud', c.vertex?.projectId); - add('moonshot', 'cloud', c.moonshot?.apiKey); - add('ollama', 'local', c.ollama?.endpoint); - add('llamacpp', 'local', c.llamacpp?.endpoint); - add('lmstudio', 'local', c.lmstudio?.endpoint); - - return providers; + return { + databricks: { type: 'cloud', configured: !!(c.databricks?.url && c.databricks?.apiKey) }, + 'azure-anthropic': { type: 'cloud', configured: !!(c.azureAnthropic?.endpoint && c.azureAnthropic?.apiKey) }, + bedrock: { type: 'cloud', configured: !!c.bedrock?.apiKey }, + openrouter: { type: 'cloud', configured: !!c.openrouter?.apiKey }, + openai: { type: 'cloud', configured: !!c.openai?.apiKey }, + 'azure-openai': { type: 'cloud', configured: !!(c.azureOpenAI?.endpoint && c.azureOpenAI?.apiKey) }, + vertex: { type: 'cloud', configured: !!c.vertex?.projectId }, + moonshot: { type: 'cloud', configured: !!c.moonshot?.apiKey }, + ollama: { type: 'local', configured: !!c.ollama?.endpoint }, + llamacpp: { type: 'local', configured: 
!!c.llamacpp?.endpoint }, + lmstudio: { type: 'local', configured: !!c.lmstudio?.endpoint }, + }; +} + +// Providers the active routing config actually points at: the provider prefix +// of each TIER_* value (format `provider:model[:variant]`) plus the base +// MODEL_PROVIDER. Returns Map<provider, labels[]> (tier names, plus 'default'). +function getReferencedProviders() { + const refs = new Map(); + const note = (provider, label) => { + const key = String(provider || '').trim().toLowerCase(); + if (!key) return; + if (!refs.has(key)) refs.set(key, []); + if (label && !refs.get(key).includes(label)) refs.get(key).push(label); + }; + + const tiers = config.modelTiers || {}; + for (const [tier, val] of Object.entries(tiers)) { + if (typeof val === 'string' && val.trim()) { + note(val.split(':')[0], tier); + } + } + note(config.modelProvider?.type, 'default'); + + return refs; +} + +// Providers used by the routing config that have credentials/endpoints set. +// Unknown providers (no metadata) are included optimistically since we can't +// verify their credentials. +function getConfiguredProviders() { + const meta = providerMeta(); + const out = []; + for (const [name, tiers] of getReferencedProviders()) { + const m = meta[name]; + if (!m || m.configured) { + out.push({ name, type: m?.type || 'cloud', tiers }); + } + } + return out; +} + +// Tiers pointing at a known provider whose credentials/endpoint are missing — +// surfaced as a warning so a misconfigured tier is visible. 
+function getProviderWarnings() { + const meta = providerMeta(); + const out = []; + for (const [name, tiers] of getReferencedProviders()) { + const m = meta[name]; + if (m && !m.configured) { + out.push({ name, type: m.type, tiers }); + } + } + return out; } // Noise provider names injected by unit tests — filter them out of UI @@ -92,7 +142,8 @@ function overview(req, res) { port: config.port, version: process.env.npm_package_version || '9.0.2', modelProvider: config.modelProvider?.type || 'unknown', - providers: getConfiguredProviders(), + providers: getConfiguredProviders(), + providerWarnings: getProviderWarnings(), statsWindow: win.label, metrics: { requestsTotal: snap.requestsTotal, From 320e5ee380bd48467e4871986f7ba10537986e5b Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Mon, 8 Jun 2026 21:48:32 -0700 Subject: [PATCH 06/10] fix(model-registry): deterministic cost resolution; drop fuzzy substring match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the bidirectional `includes()` fuzzy match (which returned confidently wrong prices for unrelated names) with a deterministic ladder: override → exact → provider-prefix-strip → alias → date-normalize → longest-prefix (one-directional, length-bounded) → unknown - Unknown models now return `{ ...DEFAULT_COST, unknown: true }` and computeCostUsd records cost_usd=null (+ warn once) instead of a fabricated guess, so the bandit/cost-optimizer aren't poisoned. - Add MODEL_PRICE_OVERRIDES env so operators can pin prices the registry lacks. - Each result carries a `resolution` tag for debuggability. Adds test/model-registry-cost.test.js (incl. a regression test that a name containing a real key as a substring no longer false-matches). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 5 ++ package.json | 2 +- src/clients/databricks.js | 13 +++- src/routing/model-registry.js | 115 ++++++++++++++++++++++++------- test/model-registry-cost.test.js | 50 ++++++++++++++ 5 files changed, 157 insertions(+), 28 deletions(-) create mode 100644 test/model-registry-cost.test.js diff --git a/.env.example b/.env.example index b98ca72..6a4e090 100644 --- a/.env.example +++ b/.env.example @@ -445,6 +445,11 @@ TOON_MIN_BYTES=4096 TOON_FAIL_OPEN=true TOON_LOG_STATS=true +# Model price overrides: pin per-1M-token USD prices for models the pricing +# registry doesn't know (otherwise their cost is recorded as null/unknown). +# JSON object keyed by model name. Example: +# MODEL_PRICE_OVERRIDES={"my-model":{"input":0.5,"output":1.5}} + # Caveman terse-output injection (opt-in): append a brevity instruction to the # system prompt to reduce OUTPUT tokens. Off by default — changes model style. # Levels: lite | full | ultra diff --git a/package.json b/package.json index ea00306..9d0305a 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ "dev": "nodemon index.js", "lint": "eslint src index.js", "test": "npm run test:unit && npm run test:performance", - "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/resilience.test.js test/telemetry-routing.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js 
test/memory/search.test.js test/memory/retriever.test.js test/distill.test.js test/large-payload.test.js test/code-mode.test.js test/prompt-cache-injection.test.js test/risk-analyzer.test.js test/interaction-block.test.js test/preflight.test.js test/token-reduction.test.js test/session-affinity.test.js", + "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/resilience.test.js test/telemetry-routing.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js test/distill.test.js test/large-payload.test.js test/code-mode.test.js test/prompt-cache-injection.test.js test/risk-analyzer.test.js test/interaction-block.test.js test/preflight.test.js test/token-reduction.test.js test/session-affinity.test.js test/model-registry-cost.test.js", "test:memory": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js", "test:new-features": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js", "test:performance": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/hybrid-routing-performance.test.js && DATABRICKS_API_KEY=test-key 
DATABRICKS_API_BASE=http://test.com node test/performance-tests.js", diff --git a/src/clients/databricks.js b/src/clients/databricks.js index f72302d..5d31c79 100644 --- a/src/clients/databricks.js +++ b/src/clients/databricks.js @@ -2044,12 +2044,23 @@ async function invokeCodex(body) { * Registry returns per-1M-token prices ({ input, output }); returns null when * pricing is unknown so we don't record misleading zeros. */ +const _unknownCostWarned = new Set(); function computeCostUsd(model, inputTokens, outputTokens) { try { const { getModelRegistrySync } = require("../routing/model-registry"); const reg = getModelRegistrySync && getModelRegistrySync(); const cost = reg?.getCost?.(model); - if (!cost || (cost.input == null && cost.output == null)) return null; + if (!cost) return null; + // Unknown model → record null (not a fabricated default), warn once so the + // gap is visible and can be fixed via MODEL_PRICE_OVERRIDES. + if (cost.unknown) { + if (model && !_unknownCostWarned.has(model)) { + _unknownCostWarned.add(model); + logger.warn({ model }, "[Cost] No pricing for model — recording cost_usd=null. Set MODEL_PRICE_OVERRIDES to fix."); + } + return null; + } + if (cost.input == null && cost.output == null) return null; const inUsd = ((inputTokens || 0) / 1e6) * (cost.input || 0); const outUsd = ((outputTokens || 0) / 1e6) * (cost.output || 0); return Number((inUsd + outUsd).toFixed(6)); diff --git a/src/routing/model-registry.js b/src/routing/model-registry.js index e52258b..ac87804 100644 --- a/src/routing/model-registry.js +++ b/src/routing/model-registry.js @@ -54,9 +54,41 @@ const DATABRICKS_FALLBACK = { 'databricks-bge-large-en': { input: 0.02, output: 0, context: 512 }, }; -// Default cost for unknown models +// Default cost for unknown models. Returned with `unknown: true` so callers can +// distinguish a real price from a fabricated guess. 
const DEFAULT_COST = { input: 1.0, output: 3.0, context: 128000 }; +// Curated name aliases (exact, one-directional). Maps a name a caller might use +// to the canonical key likely present in the pricing data. Misses are harmless +// (resolution simply continues down the ladder). +const MODEL_ALIASES = { + 'claude-sonnet-4-5': 'claude-sonnet-4-5-20250929', + 'claude-opus-4-1': 'claude-opus-4-1-20250805', + 'claude-3-5-sonnet': 'claude-3-5-sonnet-20241022', +}; + +/** + * Parse MODEL_PRICE_OVERRIDES env (JSON object of + * { "<model>": { "input": <number>, "output": <number>, "context"?: N } }). + * Lets operators pin correct prices for models the registry doesn't know. + */ +function _loadOverrides() { + const out = new Map(); + const raw = process.env.MODEL_PRICE_OVERRIDES; + if (!raw) return out; + try { + const parsed = JSON.parse(raw); + for (const [name, info] of Object.entries(parsed)) { + if (info && typeof info.input === 'number' && typeof info.output === 'number') { + out.set(name.toLowerCase(), { context: 128000, ...info }); + } + } + } catch (err) { + logger.warn({ err: err.message }, '[ModelRegistry] Failed to parse MODEL_PRICE_OVERRIDES'); + } + return out; +} + class ModelRegistry { constructor() { this.litellmPrices = {}; @@ -64,6 +96,7 @@ class ModelRegistry { this.loaded = false; this.lastFetch = 0; this.modelIndex = new Map(); + this.overrides = _loadOverrides(); } /** @@ -255,40 +288,70 @@ class ModelRegistry { * @returns {Object} Cost info { input, output, context, ...
} */ getCost(modelName) { - if (!modelName) return { ...DEFAULT_COST, source: 'default' }; + if (!modelName) return { ...DEFAULT_COST, source: 'default', unknown: true }; - const normalizedName = modelName.toLowerCase(); + const name = String(modelName).toLowerCase().trim(); + const hit = this._resolveCost(name); + if (hit) return hit; - // Direct lookup - if (this.modelIndex.has(normalizedName)) { - return this.modelIndex.get(normalizedName); - } + // Nothing matched — report unknown rather than silently fabricating a price. + logger.debug({ model: modelName }, '[ModelRegistry] Model not found — cost unknown'); + return { ...DEFAULT_COST, source: 'default', unknown: true }; + } - // Try common variations - const variations = [ - normalizedName, - normalizedName.replace('databricks-', ''), - normalizedName.replace('azure/', ''), - normalizedName.replace('bedrock/', ''), - normalizedName.replace('anthropic.', ''), - normalizedName.split('/').pop(), - ]; - - for (const variant of variations) { - if (this.modelIndex.has(variant)) { - return this.modelIndex.get(variant); - } + /** + * Deterministic price resolution. Each step is exact (no bidirectional + * substring matching), and the only loose step (longest-prefix) is + * one-directional and length-bounded, so unrelated names can't false-match. + * Returns a cost object with a `resolution` tag, or null if nothing matched. + * @param {string} name - already lowercased/trimmed + */ + _resolveCost(name) { + const tag = (value, resolution, matchedAs) => ({ + ...value, + resolution, + ...(matchedAs && matchedAs !== name ? { matchedAs } : {}), + }); + + // 1. Operator overrides (exact) — ground truth. + if (this.overrides.has(name)) return tag({ ...this.overrides.get(name), source: 'override' }, 'override'); + + // 2. Exact registry hit. + if (this.modelIndex.has(name)) return tag(this.modelIndex.get(name), 'exact'); + + // 3. Provider-prefix strip (exact). 
+ const stripped = [ + name.replace(/^databricks-/, ''), + name.replace(/^azure\//, ''), + name.replace(/^bedrock\//, ''), + name.replace(/^anthropic\./, ''), + name.replace(/^openai\//, ''), + name.includes('/') ? name.split('/').pop() : null, + ].filter((v) => v && v !== name); + for (const v of stripped) { + if (this.overrides.has(v)) return tag({ ...this.overrides.get(v), source: 'override' }, 'prefix-strip', v); + if (this.modelIndex.has(v)) return tag(this.modelIndex.get(v), 'prefix-strip', v); } - // Fuzzy match for partial names + // 4. Curated alias (exact). + const alias = MODEL_ALIASES[name]; + if (alias && this.modelIndex.has(alias)) return tag(this.modelIndex.get(alias), 'alias', alias); + + // 5. Date/version-suffix normalization (e.g. -20250929, -2025-09-29, -v2). + const dateless = name.replace(/[-@](\d{8}|\d{4}-\d{2}-\d{2}|v\d+)$/, ''); + if (dateless !== name && this.modelIndex.has(dateless)) return tag(this.modelIndex.get(dateless), 'date-normalize', dateless); + + // 6. Longest registry key that is a prefix of the requested name. Bounded so + // short keys can't grab unrelated names (e.g. "gpt-5.2-chat-2026" → "gpt-5.2-chat"). 
+ let best = null; for (const [key, value] of this.modelIndex.entries()) { - if (key.includes(normalizedName) || normalizedName.includes(key)) { - return value; + if (key.length >= 6 && name.startsWith(key) && (!best || key.length > best.key.length)) { + best = { key, value }; } } + if (best) return tag(best.value, 'longest-prefix', best.key); - logger.debug({ model: modelName }, '[ModelRegistry] Model not found, using default'); - return { ...DEFAULT_COST, source: 'default' }; + return null; } /** diff --git a/test/model-registry-cost.test.js b/test/model-registry-cost.test.js new file mode 100644 index 0000000..d0836cd --- /dev/null +++ b/test/model-registry-cost.test.js @@ -0,0 +1,50 @@ +const assert = require("assert"); +const { describe, it } = require("node:test"); + +const { getModelRegistrySync } = require("../src/routing/model-registry"); + +const reg = getModelRegistrySync(); + +describe("model-registry cost resolution ladder", () => { + it("resolves a known model exactly", () => { + const c = reg.getCost("gpt-5.2-chat"); + assert.strictEqual(c.unknown, undefined); + assert.ok(c.input > 0 && c.output > 0); + }); + + it("strips a provider prefix to resolve", () => { + const c = reg.getCost("databricks-claude-sonnet-4-5"); + assert.ok(!c.unknown); + assert.ok(c.input > 0); + }); + + it("matches a dated/suffixed name via longest-prefix", () => { + const base = reg.getCost("gpt-5.2-chat"); + const suffixed = reg.getCost("gpt-5.2-chat-2026"); + assert.ok(!suffixed.unknown); + assert.strictEqual(suffixed.input, base.input); + assert.strictEqual(suffixed.matchedAs, "gpt-5.2-chat"); + }); + + it("returns unknown (not a fabricated price) for a garbage name", () => { + const c = reg.getCost("totally-made-up-model-xyz"); + assert.strictEqual(c.unknown, true); + assert.strictEqual(c.resolution, undefined); + }); + + it("does not false-match a too-short name", () => { + assert.strictEqual(reg.getCost("xx").unknown, true); + }); + + it("treats empty/missing model as 
unknown", () => { + assert.strictEqual(reg.getCost("").unknown, true); + assert.strictEqual(reg.getCost(null).unknown, true); + }); + + it("never does a bidirectional substring match (the old fuzzy hazard)", () => { + // A name that contains a real key as a *substring* but not as a prefix must + // NOT resolve to that key. + const c = reg.getCost("my-custom-gpt-5.2-chat-wrapper"); + assert.strictEqual(c.unknown, true); + }); +}); From 7e7b6433609f1eabfe7ca787468ba52a2a549a94 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Tue, 9 Jun 2026 12:30:20 -0700 Subject: [PATCH 07/10] docs: sync benchmark numbers from BENCHMARK_REPORT, document new features - Benchmark Results: switch to the Lynkr-vs-LiteLLM numbers from BENCHMARK_REPORT.md (53% / 87.6% token reduction with cost, 171ms cache, tier-routing comparison, ~50% monthly cost projection) + run date/versions. - Advanced Features: document always-on optimizations (smart tool selection, RTK compression, MCP tool dedup, request bypass), opt-in caveman terse mode, and the cost-tracking / MODEL_PRICE_OVERRIDES behavior. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 60 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 9935f54..765b430 100644 --- a/README.md +++ b/README.md @@ -545,6 +545,28 @@ TOOL_INJECTION_ENABLED=false CODE_MODE_ENABLED=true ``` +Always-on (no config): **smart tool selection** (server mode), **RTK tool-result +compression** (test/git/grep/lint/build/JSON output), **MCP tool dedup** (drops +built-in WebSearch/WebFetch when an Exa/Tavily MCP tool is present), and +**request bypass** (Claude CLI Warmup / title-extraction calls are answered +locally, never hitting a provider). 
+ +Optional **terse-output mode** to cut *output* tokens: +```bash +CAVEMAN_ENABLED=true # off by default — nudges the model to be concise +CAVEMAN_LEVEL=lite # lite | full | ultra +``` + +### Cost tracking & model pricing +Per-request cost is computed from a model-pricing registry (LiteLLM → models.dev, +cached 24h) and recorded in telemetry. Models the registry doesn't know record +`cost_usd=null` (logged once) rather than a fabricated price. Pin prices for +unknown models: +```bash +# Per-1M-token USD prices, JSON keyed by model name +MODEL_PRICE_OVERRIDES={"my-model":{"input":0.5,"output":1.5}} +``` + ### Memory System (Titans-inspired) ```bash MEMORY_ENABLED=true @@ -652,35 +674,45 @@ npm start ## Benchmark Results -Measured on real agentic coding workloads (Claude Code / Cursor sessions) with Ollama, Moonshot, and Azure OpenAI backends. Run with `node benchmark-tier-routing.js`. +Head-to-head against **LiteLLM** on the **same backends** (Ollama `minimax-m2.5`, Moonshot, Azure OpenAI), 9 scenarios across 4 feature categories. Apples-to-apples comparison is Lynkr vs LiteLLM **billed tokens on the same scenario**. Run with `node benchmark-tier-routing.js`. 
-### Token compression +> _Run: June 5, 2026 · Lynkr v9.3.2 · LiteLLM v1.87.1 · macOS, Apple Silicon._ -| Scenario | Tokens without Lynkr | Tokens with Lynkr | Reduction | +### Token reduction (vs LiteLLM, same model & prompt) + +| Mechanism | Lynkr | LiteLLM | Result | |---|---|---|---| -| 14-tool request (read task) | 1,042 | **547** | **47%** | -| 14-tool request (write task) | 1,043 | **412** | **60%** | -| Large JSON grep result (60 items) | 3,458 | **427** | **87.6%** | +| Smart tool selection (14 tools) | **959** tokens · $0.0044 | 2,085 tokens · $0.0091 | **53% fewer tokens, 52% cheaper** | +| TOON compression (60-item grep JSON) | **427** tokens · $0.009 | 3,458 tokens · $0.018 | **87.6% fewer tokens, 50% cheaper** | -Lynkr strips irrelevant tool schemas before forwarding (smart tool selection) and binary-compresses large JSON tool results (TOON) — both happen in-process with no added latency. +Lynkr strips irrelevant tool schemas (smart tool selection) and binary-compresses large JSON tool results (TOON) — both in-process, no added latency. ### Semantic cache | | Tokens billed | Response time | |---|---|---| | First call (cold) | 2,857 | 1,891ms | -| **Second call — paraphrased, cache hit** | **0** | **171ms** | +| **Second call — paraphrased, cache hit** | **0** (served from cache) | **171ms (11× faster)** | -Near-identical prompts return cached responses in 171ms. Zero tokens billed on a cache hit. +Near-identical prompts return cached responses in 171ms. Zero model tokens billed on a cache hit. ### Tier routing -| Request | Routed to | -|---|---| -| "What does git stash do?" | SIMPLE → local model (free) | -| JWT vs cookies security analysis | COMPLEX → cloud model (correct) | +| Request | Lynkr routes to | LiteLLM routes to | +|---|---|---| +| "What does git stash do?" 
| `minimax-m2.5` (local, free) | Ollama (local) | +| JWT vs cookies security analysis | `moonshot` (cloud — correct) | **Ollama (local — wrong call)** | + +Lynkr scores each request on 15 dimensions (token count, code complexity, reasoning markers, risk signals, agentic patterns) and escalates automatically. LiteLLM's `cost-based-routing` sends everything to the cheapest model regardless of complexity. + +### Cost projection (100,000 requests/month, same backend) + +| | Monthly cost | vs LiteLLM | +|---|---|---| +| LiteLLM | ~$818 | baseline | +| **Lynkr** | **~$409** | **~50% cheaper** | -Lynkr scores each request on 15 dimensions (token count, code complexity, reasoning markers, risk signals, agentic patterns) and routes automatically. No caller changes needed. +_Based on a tool-heavy agentic session (TOON scenario). On equal footing — same provider, same model — Lynkr is cheaper due to token optimization._ → [Full benchmark report with methodology](BENCHMARK_REPORT.md) From 380d905b52f6943f881c07c049ff493530dddb6a Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Tue, 9 Jun 2026 12:32:50 -0700 Subject: [PATCH 08/10] =?UTF-8?q?docs(token-optimization):=20add=20Phase?= =?UTF-8?q?=207=20=E2=80=94=20tool-result=20compression=20(RTK=20+=20TOON)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the in-process tool_result compression that was previously undocumented: - RTK pattern compressors (test/git/grep/lint/build/container/json/dir/file + dedup_log/smart_truncate), tier-aware size thresholds, and tee lossless recovery via GET /tee/:id. - TOON binary JSON encoding with its config (TOON_ENABLED/MIN_BYTES/FAIL_OPEN). Adds an overview row and fixes the phase-count labels. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- documentation/token-optimization.md | 57 ++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/documentation/token-optimization.md b/documentation/token-optimization.md index 8e19a93..3fcaf4c 100644 --- a/documentation/token-optimization.md +++ b/documentation/token-optimization.md @@ -12,6 +12,7 @@ Lynkr reduces tokens sent to the model through multiple independent mechanisms. |---|---|---| | **Smart tool selection** | **47–60%** | 14-tool request (read or write task) | | **TOON JSON compression** | **87.6%** | Large grep/file-read tool result (60-item array) | +| **Tool-result compression (RTK)** | up to **87.6%** | grep/test/git/lint/build/log/JSON tool output | | **Semantic cache** | **100% on hit, 171ms** | Paraphrased repeat query | | MCP Code Mode | **96%** | 100+ MCP tool schemas → 4 meta-tools | | History compression | up to 80% | Long multi-turn sessions | @@ -45,7 +46,7 @@ At 100,000 requests/month on a tool-heavy agentic workload, this translates to * --- -## 7 Optimization Phases +## Optimization Phases ### Phase 0: MCP Code Mode (96% reduction for MCP tools) @@ -283,6 +284,58 @@ HISTORY_SUMMARIZE_OLDER=true # Summarize older turns (default: true) --- +### Phase 7: Tool-Result Compression (up to 87.6% on tool output) + +**Problem:** Tool results dominate agentic token usage. A single `grep`, test run, `git diff`, or JSON API response can be thousands of tokens — most of it boilerplate the model doesn't need to reason over. + +Lynkr compresses `tool_result` blocks **in-process before forwarding** (no added latency), via two complementary mechanisms. + +#### 7a. RTK pattern compression + +Detects the *shape* of a tool result and rewrites it to a compact, information-preserving summary. Each detector only fires when it recognizes the format; unrecognized text passes through unchanged. 
+ +| Detector | What it compresses | Example outcome | +|----------|--------------------|-----------------| +| `test_output` | jest/vitest/pytest/cargo/go test logs | Keep the summary line + failures, drop passing-test noise | +| `git_diff` | `git diff` | Per-file `+adds/-dels` with capped change lines | +| `git_status` | `git status` | Branch + staged/modified/untracked lists | +| `git_log` | `git log` | One line per commit (` (author, date)`) | +| `lint_output` | eslint/tsc/ruff/clippy/biome | Counts grouped by rule, not every occurrence | +| `build_output` | npm/cargo/webpack | Errors + capped warnings + success line | +| `container_output` | docker/kubectl tables | Header + first N rows + “+M more” | +| `json_response` | large JSON objects | Structural skeleton (search/fetch results preserved) | +| `grep_output` | `grep`/`rg` (`file:line:content`) | Grouped by file, capped at 10 matches/file | +| `directory_listing` | `ls`/`find`/`tree` | Grouped by directory with counts | +| `large_file` | long source files | Imports + signatures skeleton | +| `dedup_log` | repetitive logs | Collapses consecutive duplicate lines | +| `smart_truncate` | very long unmatched output | Keeps head + tail, drops the middle | + +**Tier-aware thresholds** — compression only kicks in above a size that scales with the routing tier, so cheap models get aggressive compression and reasoning models get the full picture: + +| Tier | Compress if result exceeds | +|------|----------------------------| +| SIMPLE | 300 chars | +| MEDIUM | 800 chars | +| COMPLEX | 2,000 chars | +| REASONING | never | + +**Lossless recovery (tee):** the full original is stashed for 5 minutes and a pointer (`[full: tee_…]`) is appended to the compressed result. The model — or you — can fetch the original via `GET /tee/:id` if the detail is actually needed. + +Always on (no configuration). Metrics: `GET /metrics/tool-compression`. + +#### 7b. 
TOON compression (binary JSON encoding) + +For large JSON tool results (arrays of objects, API payloads), TOON re-encodes the structure into a far denser representation than pretty-printed JSON — **87.6% reduction** on a 60-item grep array in benchmarks. Plain text and small payloads are left untouched. + +```bash +TOON_ENABLED=true # opt-in (default: false) +TOON_MIN_BYTES=4096 # only compress payloads larger than this +TOON_FAIL_OPEN=true # on any encode error, forward the original (default: true) +TOON_LOG_STATS=true # log per-call compression stats +``` + +--- + ### Phase 8: Headroom Context Compression (Optional, 47-92% reduction) **Problem:** Even with all other optimizations, large requests can still exceed context limits. @@ -308,7 +361,7 @@ HEADROOM_ENABLED=true ## Combined Savings -When all 8 phases work together: +When all phases work together: **Example Request Flow:** From 3c41c4fabf184d4689981f323eab85b24d4f7cd4 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Tue, 9 Jun 2026 13:59:16 -0700 Subject: [PATCH 09/10] chore(install): fix port to 8081, surface native-module status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Correct the curl installer's printed port from 8080 to 8081 (matches .env.example/README; the installer copies .env.example which uses 8081, so the old instructions were wrong for real installs). - `npm install --production` → `--omit=dev` (keeps optionalDependencies; the postinstall native-ABI guard runs here). - After install, probe better-sqlite3 and tell the user whether telemetry / memory / sessions are enabled, with a `npm run rebuild-native` hint if the native module isn't loadable. Lynkr still runs without it. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- install.sh | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/install.sh b/install.sh index bf34dfc..d51c235 100755 --- a/install.sh +++ b/install.sh @@ -108,8 +108,24 @@ clone_or_update() { install_dependencies() { print_info "Installing dependencies..." cd "$INSTALL_DIR" - npm install --production + # --omit=dev keeps optionalDependencies (better-sqlite3, hnswlib-node, + # tree-sitter) which back telemetry, the memory store and routing ML. + # The postinstall hook (scripts/check-native.js) verifies the native ABI + # and rebuilds if Node was upgraded — best-effort, never fails the install. + npm install --omit=dev print_success "Dependencies installed" + + # Native optional modules need a C/C++ toolchain only if no prebuilt binary + # is available for this platform. They degrade gracefully if absent. + if ! node -e "const D=require('better-sqlite3'); new D(':memory:').close()" >/dev/null 2>&1; then + print_warning "Native module 'better-sqlite3' is not loadable." + echo " Telemetry, the memory store and sessions need it. To enable:" + echo " - Ensure a build toolchain is present (Xcode CLT on macOS, build-essential + python3 on Linux), then:" + echo " - ${BLUE}cd $INSTALL_DIR && npm run rebuild-native${NC}" + echo " Lynkr still runs without it (those features stay disabled)." + else + print_success "Native modules OK (telemetry, memory, sessions enabled)" + fi } # Create default .env file @@ -131,7 +147,7 @@ create_env_file() { MODEL_PROVIDER=ollama # Server Configuration -PORT=8080 +PORT=8081 # Ollama Configuration (default for local development) OLLAMA_MODEL=qwen2.5-coder:7b @@ -161,7 +177,7 @@ EOF print_info "📝 Configuration ready! 
Key settings:" echo " • Default provider: Ollama (local, offline)" echo " • Memory system: Enabled (learns from conversations)" - echo " • Port: 8080" + echo " • Port: 8081" echo "" print_warning "To use cloud providers (Databricks/OpenAI/Azure):" echo " Edit: ${BLUE}nano $INSTALL_DIR/.env${NC}" @@ -220,7 +236,7 @@ print_next_steps() { echo " ${BLUE}lynkr${NC}" echo "" echo " 3. Configure Claude Code CLI:" - echo " ${BLUE}export ANTHROPIC_BASE_URL=http://localhost:8080${NC}" + echo " ${BLUE}export ANTHROPIC_BASE_URL=http://localhost:8081${NC}" echo " ${BLUE}claude${NC}" echo "" echo " ${YELLOW}Option B: Use Cloud Providers (Databricks/OpenAI/Azure)${NC}" @@ -238,7 +254,7 @@ print_next_steps() { echo " ${BLUE}lynkr${NC}" echo "" echo " 3. Configure Claude Code CLI:" - echo " ${BLUE}export ANTHROPIC_BASE_URL=http://localhost:8080${NC}" + echo " ${BLUE}export ANTHROPIC_BASE_URL=http://localhost:8081${NC}" echo " ${BLUE}export ANTHROPIC_API_KEY=any-non-empty-value${NC} ${GREEN}← Placeholder${NC}" echo " ${BLUE}claude${NC}" echo "" From 97c5e0b5c42cb4de0f8764208eda98e077209d58 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Tue, 9 Jun 2026 22:49:51 -0700 Subject: [PATCH 10/10] docs(homepage): bump displayed version to 9.4.6 (latest npm) Update the GitHub Pages homepage version indicators (JSON-LD softwareVersion and the hero badge) from 9.3.2 to 9.4.6 to match the latest npm release. The benchmark-provenance line is left at v9.3.2 since that records the version the published benchmarks were actually run on. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/index.html | 4 ++-- docs/index.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.html b/docs/index.html index 1c2c025..66570b6 100644 --- a/docs/index.html +++ b/docs/index.html @@ -34,7 +34,7 @@ "description": "Self-hosted LLM gateway for Claude Code, Cursor, and Codex. 
Compresses tokens before they hit the model.", "url": "https://github.com/Fast-Editor/Lynkr", "downloadUrl": "https://www.npmjs.com/package/lynkr", - "softwareVersion": "9.3.2", + "softwareVersion": "9.4.6", "author": { "@type": "Person", "name": "Vishal Veera Reddy", "url": "https://github.com/vishalveerareddy123" }, "offers": { "@type": "Offer", "price": "0", "priceCurrency": "USD" }, "keywords": "LLM gateway, Claude Code, Cursor, Ollama, AWS Bedrock, AI coding, self-hosted" @@ -72,7 +72,7 @@
-
v9.3.2 — benchmarked in production
+
v9.4.6 — benchmarked in production

The LLM gateway
diff --git a/docs/index.md b/docs/index.md index 4457b3c..02aa934 100644 --- a/docs/index.md +++ b/docs/index.md @@ -50,7 +50,7 @@ "description": "Self-hosted LLM gateway server that enables Claude Code, Cursor, and AI coding tools to work with any LLM provider with 60-80% cost reduction.", "url": "https://github.com/Fast-Editor/Lynkr", "downloadUrl": "https://www.npmjs.com/package/lynkr", - "softwareVersion": "9.3.2", + "softwareVersion": "9.4.6", "author": { "@type": "Person", "name": "Vishal Veera Reddy", @@ -107,7 +107,7 @@
- v9.3.2 — Production Ready + v9.4.6 — Production Ready