From 569ee080e7903d29ee12e5b69b351074a964c778 Mon Sep 17 00:00:00 2001 From: oratis Date: Fri, 29 May 2026 23:48:38 +0800 Subject: [PATCH 1/2] chore(tooling): enforce lint in pre-commit + ignore Rust build output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eslint was scanning apps/desktop/src-tauri/target/ (generated Cargo JS), producing 49 spurious errors locally — CI only escaped because it lints before cargo build. Add target/ + release-artifacts/ to the ignore list, then wire `pnpm lint` into the pre-commit hook so lint regressions are caught before they land instead of only in CI. Co-Authored-By: Claude Opus 4.8 (1M context) --- .husky/pre-commit | 3 +++ eslint.config.js | 2 ++ 2 files changed, 5 insertions(+) diff --git a/.husky/pre-commit b/.husky/pre-commit index 6cc60ec..0987320 100755 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -4,6 +4,9 @@ set -e +echo "▎ pre-commit: lint" +pnpm lint + echo "▎ pre-commit: typecheck" pnpm -r typecheck diff --git a/eslint.config.js b/eslint.config.js index 811a7b6..abf5eef 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -13,7 +13,9 @@ export default [ '**/dist/**', '**/dist-electron/**', '**/node_modules/**', + '**/target/**', // Rust/Cargo build output (generated JS in src-tauri/target) '**/.tsbuildinfo', + 'release-artifacts/**', 'apps/desktop/electron/**', // requires electron types — pending M6-rest ], }, From 0af6ba1470c84c8684ac38fb24b134b6795b078b Mon Sep 17 00:00:00 2001 From: oratis Date: Fri, 29 May 2026 23:48:57 +0800 Subject: [PATCH 2/2] fix(core): agent-loop & provider correctness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent fixes in the core engine: 1. Auto-compact fired every turn once over the threshold. `shouldCompact` read the cumulative `totalUsage` (summed across all turns, never reset), so it both triggered far too early (each turn re-sends the whole history, inflating the sum) and, once over the line, re-compacted the already- compacted history on every subsequent turn. Use the latest turn's `result.usage.inputTokens` — the true current-context size — instead. 2. Read-only tools now execute concurrently. The per-turn tool loop awaited each call sequentially; Read/Grep/Glob/WebFetch/WebSearch are side-effect- free and independent, so they run via Promise.all while mutating tools (Edit/Write/Bash/…) stay sequential to preserve snapshot ordering and one-at-a-time approval prompts. Results are reassembled in the model's original order via tool_use_id. 3. A max_tokens-truncated tool call is no longer executed as garbage. When the completion is cut off mid-arguments the partial JSON won't parse and `input` became {} — e.g. a Write with no file_path. The DeepSeek provider now drops such calls and reports stopReason 'max_tokens' so the loop ends cleanly instead of executing a malformed call. Adds regression tests for all three. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/core/src/agent.test.ts | 151 +++++++++++++++++++ packages/core/src/agent.ts | 92 ++++++++--- packages/core/src/providers/deepseek.test.ts | 66 ++++++++ packages/core/src/providers/deepseek.ts | 34 ++++- 4 files changed, 311 insertions(+), 32 deletions(-) diff --git a/packages/core/src/agent.test.ts b/packages/core/src/agent.test.ts index 5d3a019..850d099 100644 --- a/packages/core/src/agent.test.ts +++ b/packages/core/src/agent.test.ts @@ -278,6 +278,157 @@ describe('runAgent', () => { } }); + it('runs multiple read-only tool calls concurrently and preserves result order', async () => { + const events2: string[] = []; + const delay = (ms: number) => new Promise((r) => setTimeout(r, ms)); + const slowReadOnly = (name: string) => ({ + name, + definition: { name, description: name, inputSchema: { type: 'object', properties: {} } }, + async execute() { + events2.push(`start:${name}`); + await delay(20); + events2.push(`end:${name}`); + return { content: `${name} done` }; + }, + }); + // Custom registry with two read-only-named tools (Grep + Glob ∈ READ_ONLY_TOOLS). + const tools = new ToolRegistry([ + slowReadOnly('Grep'), + slowReadOnly('Glob'), + ] as unknown as Parameters[0][]); + + const provider = new MockProvider([ + { + content: [ + { type: 'text', text: 'searching' }, + { type: 'tool_use', id: 'g1', name: 'Grep', input: {} }, + { type: 'tool_use', id: 'g2', name: 'Glob', input: {} }, + ], + stopReason: 'tool_use', + usage: { inputTokens: 1, outputTokens: 1, reasoningTokens: 0, cacheReadTokens: 0 }, + }, + endTurn('done'), + ]); + + const result = await runAgent({ + provider, + tools, + systemPrompt: '', + userMessage: 'find things', + model: 'deepseek-chat', + cwd, + }); + + // Concurrency: both tools start before either finishes. + expect(events2.slice(0, 2).every((e) => e.startsWith('start:'))).toBe(true); + expect(events2.slice(2).every((e) => e.startsWith('end:'))).toBe(true); + + // Result order matches the model's call order (Grep then Glob) regardless of + // which promise settled first. + const toolResultMsg = result.history[2]!; // user msg with tool_result blocks + expect(toolResultMsg.role).toBe('user'); + const ids = toolResultMsg.content + .filter((b): b is Extract => b.type === 'tool_result') + .map((b) => b.tool_use_id); + expect(ids).toEqual(['g1', 'g2']); + }); + + it('does not auto-compact on cumulative usage when each turn is below threshold', async () => { + // Regression: shouldCompact must use the *current* turn's input tokens, not + // the cumulative sum across turns. contextWindow 100, threshold 0.8 → trigger + // at 80. Each turn reports inputTokens 30 (below 80), so the per-turn proxy + // never crosses — but the cumulative sum (30+30+30=90) would, under the old + // buggy logic, fire compaction on turn 3. Assert it never fires. + await fs.writeFile(join(cwd, 'x.txt'), 'data'); + + // A provider that counts how many times the compaction summarizer runs + // (identified by the compaction system prompt + empty tool list). + let summarizerCalls = 0; + const turn = (): ProviderResult => ({ + content: withToolCall('working', { + type: 'tool_use', + id: `c${Math.random()}`, + name: 'Read', + input: { file_path: 'x.txt' }, + }), + stopReason: 'tool_use', + usage: { inputTokens: 30, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, + }); + const scripted: ProviderResult[] = [turn(), turn(), endTurn('done')]; + const countingProvider: Provider = { + name: 'counting', + async runTurn(opts: ProviderRunOpts): Promise { + if (opts.systemPrompt.startsWith('You compress long agent conversations')) { + summarizerCalls++; + return endTurn('summary'); + } + const next = scripted.shift(); + if (!next) throw new Error('no scripted response'); + return next; + }, + }; + + const result = await runAgent({ + provider: countingProvider, + tools: new ToolRegistry(), + systemPrompt: 'agent', + userMessage: 'go', + model: 'deepseek-chat', + cwd, + autoCompact: { contextWindow: 100, threshold: 0.8 }, + }); + + expect(result.stopReason).toBe('end_turn'); + expect(summarizerCalls).toBe(0); + }); + + it('auto-compacts once when a single turn crosses the threshold', async () => { + // Inverse of the above: when the *current* turn's input alone exceeds the + // threshold (90 > 80), compaction should fire. History after one tool turn + // is short, so compact() keeps it verbatim, but the summarizer is still + // invoked — proving the trigger path is live. + await fs.writeFile(join(cwd, 'x.txt'), 'data'); + let summarizerCalls = 0; + const scripted: ProviderResult[] = [ + { + content: withToolCall('working', { + type: 'tool_use', + id: 'big', + name: 'Read', + input: { file_path: 'x.txt' }, + }), + stopReason: 'tool_use', + usage: { inputTokens: 90, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, + }, + endTurn('done'), + ]; + const provider: Provider = { + name: 'counting', + async runTurn(opts: ProviderRunOpts): Promise { + if (opts.systemPrompt.startsWith('You compress long agent conversations')) { + summarizerCalls++; + return endTurn('summary'); + } + const next = scripted.shift(); + if (!next) throw new Error('no scripted response'); + return next; + }, + }; + + await runAgent({ + provider, + tools: new ToolRegistry(), + systemPrompt: 'agent', + userMessage: 'go', + model: 'deepseek-chat', + cwd, + // Tiny keep window so compact() doesn't short-circuit on the short history. + autoCompact: { contextWindow: 100, threshold: 0.8, keepFirstPairs: 0, keepLastMessages: 1 }, + }); + + expect(summarizerCalls).toBe(1); + }); + it('honors systemReminders: false to skip injection entirely', async () => { const provider = new MockProvider([endTurn('hi')]); const tools = new ToolRegistry(); diff --git a/packages/core/src/agent.ts b/packages/core/src/agent.ts index ae58f9e..d3e1891 100644 --- a/packages/core/src/agent.ts +++ b/packages/core/src/agent.ts @@ -101,6 +101,14 @@ export interface RunAgentResult { const DEFAULT_MAX_TURNS = 16; +/** + * Tools with no side effects whose results don't depend on each other — safe to + * execute concurrently within a single turn. Everything else (Edit/Write/Bash/ + * TodoWrite/AskUserQuestion/ExitPlanMode) runs sequentially to preserve snapshot + * ordering, mutation order, and one-at-a-time interactive prompts. + */ +const READ_ONLY_TOOLS = new Set(['Read', 'Grep', 'Glob', 'WebFetch', 'WebSearch']); + /** * Runs the agent loop until the model produces an end_turn (no tool calls), * or `maxTurns` is reached, or the abort signal fires. @@ -233,14 +241,27 @@ export async function runAgent(opts: RunAgentOptions): Promise { return { history, turnsUsed, usage: totalUsage, stopReason: 'end_turn', modeSignal }; } - // Execute tool calls and append a single user-role message with tool_result blocks - const toolResults: ToolResultBlock[] = []; - for (const block of result.content) { - if (block.type !== 'tool_use') continue; - const toolUse = block as ToolUseBlock; + // Execute tool calls and append a single user-role message with tool_result + // blocks. Two phases: + // 1. (sequential) resolve handler + permission for each call. Approval + // prompts must never overlap, so gating stays strictly ordered. + // 2. (mixed) execute. Side-effect-free reads run concurrently via + // Promise.all (the common "model emits 3 Reads at once" case); tools + // that mutate state / snapshot run sequentially to preserve ordering. + // tool_result blocks carry their tool_use_id, so the final array is + // re-assembled in the model's original order regardless of finish order. + const toolBlocks = result.content.filter( + (b): b is ToolUseBlock => b.type === 'tool_use', + ); + const resultsById = new Map(); + type Ready = { toolUse: ToolUseBlock; handler: NonNullable> }; + const ready: Ready[] = []; + + // Phase 1 — sequential gate + approval. + for (const toolUse of toolBlocks) { const handler = opts.tools.get(toolUse.name); if (!handler) { - toolResults.push({ + resultsById.set(toolUse.id, { type: 'tool_result', tool_use_id: toolUse.id, content: `Error: tool not found: ${toolUse.name}`, @@ -268,7 +289,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { allowed = decision === true || decision === 'always'; } if (!allowed) { - toolResults.push({ + resultsById.set(toolUse.id, { type: 'tool_result', tool_use_id: toolUse.id, content: `Tool call blocked: ${verdict.reason}`, @@ -287,12 +308,16 @@ export async function runAgent(opts: RunAgentOptions): Promise { } } - // Pre-execution snapshot (Edit/Write only) - if ( - opts.enableSnapshots !== false && - opts.session && - (toolUse.name === 'Edit' || toolUse.name === 'Write') - ) { + ready.push({ toolUse, handler }); + } + + // Runs one approved tool end-to-end: pre-snapshot, execute, PostToolUse + // hook, post-snapshot, event + result. Side-effect-free tools call this + // concurrently; mutating tools call it one at a time (see partition below). + const execOne = async ({ toolUse, handler }: Ready): Promise => { + const isFileMutation = toolUse.name === 'Edit' || toolUse.name === 'Write'; + + if (opts.enableSnapshots !== false && opts.session && isFileMutation) { const filePath = (toolUse.input as { file_path?: string }).file_path; if (filePath) { await opts.session.manager.snapshot({ @@ -327,13 +352,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { }); } - // Post-execution snapshot - if ( - opts.enableSnapshots !== false && - opts.session && - (toolUse.name === 'Edit' || toolUse.name === 'Write') && - !tr.isError - ) { + if (opts.enableSnapshots !== false && opts.session && isFileMutation && !tr.isError) { const filePath = (toolUse.input as { file_path?: string }).file_path; if (filePath) { await opts.session.manager.snapshot({ @@ -347,13 +366,26 @@ export async function runAgent(opts: RunAgentOptions): Promise { } opts.onEvent?.({ type: 'tool_result', id: toolUse.id, result: tr }); - toolResults.push({ + resultsById.set(toolUse.id, { type: 'tool_result', tool_use_id: toolUse.id, content: tr.content, is_error: tr.isError, }); - } + }; + + // Phase 2 — execute. Read-only tools have no side effects and don't touch + // snapshotSeq, so they're safe to run concurrently; everything else stays + // sequential to keep snapshot ordering deterministic. + const parallel = ready.filter((r) => READ_ONLY_TOOLS.has(r.toolUse.name)); + const serial = ready.filter((r) => !READ_ONLY_TOOLS.has(r.toolUse.name)); + await Promise.all(parallel.map(execOne)); + for (const r of serial) await execOne(r); + + // Re-assemble in the model's original tool-call order. + const toolResults: ToolResultBlock[] = toolBlocks + .map((b) => resultsById.get(b.id)) + .filter((r): r is ToolResultBlock => r !== undefined); const resultMsg: StoredMessage = { role: 'user', @@ -363,12 +395,22 @@ export async function runAgent(opts: RunAgentOptions): Promise { history.push(resultMsg); if (opts.session) await opts.session.manager.append(opts.session.id, resultMsg); - // M3c: auto-compact if usage crossed threshold + // M3c: auto-compact if the *current* context crossed the threshold. + // + // Use this turn's usage (result.usage), NOT the cumulative totalUsage. + // `result.usage.inputTokens` is exactly the size of the history we just + // sent to the model, so it is the true current-context proxy. Cumulative + // usage is wrong on two counts: it sums every turn's input (each turn + // re-sends the whole history, so it inflates far past the real window and + // crosses the threshold too early), and it never shrinks after a compaction + // — meaning once over the line it would re-compact the already-compacted + // history on every subsequent turn. The next turn's inputTokens naturally + // reflects the freshly-compacted (smaller) context, so this self-corrects. if ( opts.autoCompact && shouldCompact({ - inputTokens: totalUsage.inputTokens, - outputTokens: totalUsage.outputTokens, + inputTokens: result.usage.inputTokens, + outputTokens: result.usage.outputTokens, contextWindow: opts.autoCompact.contextWindow, threshold: opts.autoCompact.threshold, }) diff --git a/packages/core/src/providers/deepseek.test.ts b/packages/core/src/providers/deepseek.test.ts index 840acef..94603ae 100644 --- a/packages/core/src/providers/deepseek.test.ts +++ b/packages/core/src/providers/deepseek.test.ts @@ -139,6 +139,72 @@ describe('DeepSeekProvider', () => { expect(result.content[0].input).toEqual({ file_path: 'src/a.ts' }); } }); + + it('drops a tool call truncated by max_tokens instead of executing garbage', async () => { + // Model started a Write but the completion was cut off mid-arguments + // (finish_reason 'length'). The partial JSON won't parse. + const chunks = [ + { choices: [{ delta: { content: 'Creating the file' } }] }, + { + choices: [ + { + delta: { + tool_calls: [ + { index: 0, id: 'call_w', function: { name: 'Write', arguments: '{"file_path":"a.js","content":"cons' } }, + ], + }, + }, + ], + }, + { + choices: [{ delta: {}, finish_reason: 'length' }], + usage: { prompt_tokens: 10, completion_tokens: 3000 }, + }, + ]; + const p = new DeepSeekProvider({ apiKey: 'sk-test', fetch: mockFetch(chunks) }); + const result = await p.runTurn({ + model: 'deepseek-chat', + systemPrompt: '', + tools: [{ name: 'Write', description: '', inputSchema: { type: 'object', properties: {} } }], + messages: [{ role: 'user', content: [{ type: 'text', text: 'write it' }] }], + }); + // The malformed Write must NOT appear as an executable tool_use. + expect(result.content.some((b) => b.type === 'tool_use')).toBe(false); + // The partial text is preserved. + expect(result.content.some((b) => b.type === 'text')).toBe(true); + // The loop should stop, not try to execute the truncated call. + expect(result.stopReason).toBe('max_tokens'); + }); + + it('keeps a valid call but drops a truncated sibling in the same turn', async () => { + const chunks = [ + { + choices: [ + { + delta: { + tool_calls: [ + { index: 0, id: 'ok1', function: { name: 'Read', arguments: '{"file_path":"a.ts"}' } }, + { index: 1, id: 'bad', function: { name: 'Write', arguments: '{"file_path":"b.ts","content":"x' } }, + ], + }, + }, + ], + }, + { choices: [{ delta: {}, finish_reason: 'length' }], usage: { prompt_tokens: 5, completion_tokens: 8 } }, + ]; + const p = new DeepSeekProvider({ apiKey: 'sk-test', fetch: mockFetch(chunks) }); + const result = await p.runTurn({ + model: 'deepseek-chat', + systemPrompt: '', + tools: [], + messages: [{ role: 'user', content: [{ type: 'text', text: 'go' }] }], + }); + const toolUses = result.content.filter((b) => b.type === 'tool_use'); + expect(toolUses).toHaveLength(1); + expect(toolUses[0]?.type === 'tool_use' && toolUses[0].name).toBe('Read'); + // At least one valid call survived → still a tool_use turn. + expect(result.stopReason).toBe('tool_use'); + }); }); describe('DeepSeekProvider message conversion', () => { diff --git a/packages/core/src/providers/deepseek.ts b/packages/core/src/providers/deepseek.ts index d26ce5c..f0e3612 100644 --- a/packages/core/src/providers/deepseek.ts +++ b/packages/core/src/providers/deepseek.ts @@ -166,20 +166,40 @@ export class DeepSeekProvider implements Provider { if (text) { content.push({ type: 'text', text }); } + // Assemble tool calls, dropping any that were truncated mid-stream. When the + // completion hits max_tokens while emitting a tool call, the accumulated + // `args` is partial JSON that won't parse — executing it produces garbage + // (e.g. a Write with no file_path). A call with no name at all is a stray + // streaming artifact. Both are dropped; a genuinely empty-arg call (args === + // '' or '{}', e.g. ExitPlanMode) is kept and validated by the tool itself. + let validToolCalls = 0; + let truncatedToolCall = false; for (const call of toolCalls.values()) { - const toolUse: ToolUseBlock = { + if (!call.name) { + truncatedToolCall = true; + continue; + } + const parsed = safeParseJson(call.args); + if (call.args !== '' && parsed === null) { + truncatedToolCall = true; + continue; + } + content.push({ type: 'tool_use', id: call.id, name: call.name, - input: safeParseJson(call.args) ?? {}, - }; - content.push(toolUse); + input: parsed ?? {}, + } satisfies ToolUseBlock); + validToolCalls++; } + // stopReason precedence: only claim 'tool_use' when at least one call + // survived. If everything was truncated (or finish_reason was 'length'), + // report 'max_tokens' so the agent loop ends cleanly instead of executing a + // malformed call or looping on an empty tool turn. let stopReason: ProviderResult['stopReason']; - if (finish === 'tool_calls' || toolCalls.size > 0) stopReason = 'tool_use'; - else if (finish === 'length') stopReason = 'max_tokens'; - else if (finish === 'stop') stopReason = 'end_turn'; + if (validToolCalls > 0) stopReason = 'tool_use'; + else if (finish === 'length' || truncatedToolCall) stopReason = 'max_tokens'; else stopReason = 'end_turn'; return {