Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .husky/pre-commit
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

set -e

echo "▎ pre-commit: lint"
pnpm lint

echo "▎ pre-commit: typecheck"
pnpm -r typecheck

Expand Down
2 changes: 2 additions & 0 deletions eslint.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ export default [
'**/dist/**',
'**/dist-electron/**',
'**/node_modules/**',
'**/target/**', // Rust/Cargo build output (generated JS in src-tauri/target)
'**/.tsbuildinfo',
'release-artifacts/**',
'apps/desktop/electron/**', // requires electron types — pending M6-rest
],
},
Expand Down
151 changes: 151 additions & 0 deletions packages/core/src/agent.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,157 @@ describe('runAgent', () => {
}
});

it('runs multiple read-only tool calls concurrently and preserves result order', async () => {
  // Shared lifecycle log: each fake tool pushes `start:<name>` when execute()
  // begins and `end:<name>` after its 20 ms delay.
  const events2: string[] = [];
  const delay = (ms: number) => new Promise((r) => setTimeout(r, ms));
  const slowReadOnly = (name: string) => ({
    name,
    definition: { name, description: name, inputSchema: { type: 'object', properties: {} } },
    async execute() {
      events2.push(`start:${name}`);
      await delay(20);
      events2.push(`end:${name}`);
      return { content: `${name} done` };
    },
  });
  // Custom registry with two read-only-named tools (Grep + Glob ∈ READ_ONLY_TOOLS).
  const tools = new ToolRegistry([
    slowReadOnly('Grep'),
    slowReadOnly('Glob'),
  ] as unknown as Parameters<typeof ToolRegistry.prototype.register>[0][]);

  const provider = new MockProvider([
    {
      content: [
        { type: 'text', text: 'searching' },
        { type: 'tool_use', id: 'g1', name: 'Grep', input: {} },
        { type: 'tool_use', id: 'g2', name: 'Glob', input: {} },
      ],
      stopReason: 'tool_use',
      usage: { inputTokens: 1, outputTokens: 1, reasoningTokens: 0, cacheReadTokens: 0 },
    },
    endTurn('done'),
  ]);

  const result = await runAgent({
    provider,
    tools,
    systemPrompt: '',
    userMessage: 'find things',
    model: 'deepseek-chat',
    cwd,
  });

  // Guard against vacuous truth: `.every()` on an empty slice is always true,
  // so first prove both fake tools actually ran (2 starts + 2 ends).
  expect(events2).toHaveLength(4);

  // Concurrency: both tools start before either finishes.
  expect(events2.slice(0, 2).every((e) => e.startsWith('start:'))).toBe(true);
  expect(events2.slice(2).every((e) => e.startsWith('end:'))).toBe(true);

  // Result order matches the model's call order (Grep then Glob) regardless of
  // which promise settled first.
  const toolResultMsg = result.history[2]!; // user msg with tool_result blocks
  expect(toolResultMsg.role).toBe('user');
  const ids = toolResultMsg.content
    .filter((b): b is Extract<ContentBlock, { type: 'tool_result' }> => b.type === 'tool_result')
    .map((b) => b.tool_use_id);
  expect(ids).toEqual(['g1', 'g2']);
});

it('does not auto-compact on cumulative usage when each turn is below threshold', async () => {
  // Regression: shouldCompact must use the *current* turn's input tokens, not
  // the cumulative sum across turns. contextWindow 100, threshold 0.8 → trigger
  // at 80. Each tool-call turn reports inputTokens 30 (below 80), so the
  // per-turn proxy never crosses.
  //
  // The compaction check runs after a tool-call turn's results are appended
  // (an end_turn response returns before reaching it), so the script needs
  // THREE tool-call turns for the cumulative sum to reach 30+30+30=90 at a
  // check. Under the old cumulative logic that third check would fire
  // compaction. Assert it never fires.
  await fs.writeFile(join(cwd, 'x.txt'), 'data');

  // A provider that counts how many times the compaction summarizer runs
  // (identified by the compaction system prompt).
  let summarizerCalls = 0;
  // Deterministic, unique tool_use ids (no Math.random in fixtures).
  let callSeq = 0;
  const turn = (): ProviderResult => ({
    content: withToolCall('working', {
      type: 'tool_use',
      id: `c${callSeq++}`,
      name: 'Read',
      input: { file_path: 'x.txt' },
    }),
    stopReason: 'tool_use',
    usage: { inputTokens: 30, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 },
  });
  const scripted: ProviderResult[] = [turn(), turn(), turn(), endTurn('done')];
  const countingProvider: Provider = {
    name: 'counting',
    async runTurn(opts: ProviderRunOpts): Promise<ProviderResult> {
      if (opts.systemPrompt.startsWith('You compress long agent conversations')) {
        summarizerCalls++;
        return endTurn('summary');
      }
      const next = scripted.shift();
      if (!next) throw new Error('no scripted response');
      return next;
    },
  };

  const result = await runAgent({
    provider: countingProvider,
    tools: new ToolRegistry(),
    systemPrompt: 'agent',
    userMessage: 'go',
    model: 'deepseek-chat',
    cwd,
    // Tiny keep window so that, if compaction were (wrongly) triggered, compact()
    // would not short-circuit on the short history and the summarizer call
    // would be observable.
    autoCompact: { contextWindow: 100, threshold: 0.8, keepFirstPairs: 0, keepLastMessages: 1 },
  });

  expect(result.stopReason).toBe('end_turn');
  expect(summarizerCalls).toBe(0);
});

it('auto-compacts once when a single turn crosses the threshold', async () => {
  // Counterpart to the cumulative-usage regression test: here one turn's input
  // on its own (90) is above the trigger point (80), so compaction must fire.
  // The history after a single tool turn is short and compact() keeps it
  // verbatim, yet the summarizer still gets called — which is what proves the
  // trigger path is wired up.
  await fs.writeFile(join(cwd, 'x.txt'), 'data');

  // The only tool-call turn; its reported input size alone exceeds the threshold.
  const oversizedTurn: ProviderResult = {
    content: withToolCall('working', {
      type: 'tool_use',
      id: 'big',
      name: 'Read',
      input: { file_path: 'x.txt' },
    }),
    stopReason: 'tool_use',
    usage: { inputTokens: 90, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 },
  };
  const queue: ProviderResult[] = [oversizedTurn, endTurn('done')];

  let summarizerCalls = 0;
  const provider: Provider = {
    name: 'counting',
    async runTurn(opts: ProviderRunOpts): Promise<ProviderResult> {
      // Summarizer requests are recognised by the compaction system prompt;
      // everything else is served from the scripted queue.
      const isSummarizer = opts.systemPrompt.startsWith('You compress long agent conversations');
      if (!isSummarizer) {
        const reply = queue.shift();
        if (reply) return reply;
        throw new Error('no scripted response');
      }
      summarizerCalls += 1;
      return endTurn('summary');
    },
  };

  await runAgent({
    provider,
    tools: new ToolRegistry(),
    systemPrompt: 'agent',
    userMessage: 'go',
    model: 'deepseek-chat',
    cwd,
    // Minimal keep window, so compact() cannot bail out early on the short history.
    autoCompact: { contextWindow: 100, threshold: 0.8, keepFirstPairs: 0, keepLastMessages: 1 },
  });

  expect(summarizerCalls).toBe(1);
});

it('honors systemReminders: false to skip injection entirely', async () => {
const provider = new MockProvider([endTurn('hi')]);
const tools = new ToolRegistry();
Expand Down
92 changes: 67 additions & 25 deletions packages/core/src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ export interface RunAgentResult {

const DEFAULT_MAX_TURNS = 16;

/**
 * Names of tools that do not mutate the workspace or session state, and may
 * therefore execute concurrently with one another within a single turn.
 * Everything else (Edit/Write/Bash/TodoWrite/AskUserQuestion/ExitPlanMode) runs
 * sequentially to preserve snapshot ordering, mutation order, and
 * one-at-a-time interactive prompts.
 *
 * Typed as `ReadonlySet` so this concurrency allow-list cannot be altered at
 * runtime by accident.
 */
const READ_ONLY_TOOLS: ReadonlySet<string> = new Set([
  'Read',
  'Grep',
  'Glob',
  'WebFetch',
  'WebSearch',
]);

/**
* Runs the agent loop until the model produces an end_turn (no tool calls),
* or `maxTurns` is reached, or the abort signal fires.
Expand Down Expand Up @@ -233,14 +241,27 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
return { history, turnsUsed, usage: totalUsage, stopReason: 'end_turn', modeSignal };
}

// Execute tool calls and append a single user-role message with tool_result blocks
const toolResults: ToolResultBlock[] = [];
for (const block of result.content) {
if (block.type !== 'tool_use') continue;
const toolUse = block as ToolUseBlock;
// Execute tool calls and append a single user-role message with tool_result
// blocks. Two phases:
// 1. (sequential) resolve handler + permission for each call. Approval
// prompts must never overlap, so gating stays strictly ordered.
// 2. (mixed) execute. Side-effect-free reads run concurrently via
// Promise.all (the common "model emits 3 Reads at once" case); tools
// that mutate state / snapshot run sequentially to preserve ordering.
// tool_result blocks carry their tool_use_id, so the final array is
// re-assembled in the model's original order regardless of finish order.
const toolBlocks = result.content.filter(
(b): b is ToolUseBlock => b.type === 'tool_use',
);
const resultsById = new Map<string, ToolResultBlock>();
type Ready = { toolUse: ToolUseBlock; handler: NonNullable<ReturnType<typeof opts.tools.get>> };
const ready: Ready[] = [];

// Phase 1 — sequential gate + approval.
for (const toolUse of toolBlocks) {
const handler = opts.tools.get(toolUse.name);
if (!handler) {
toolResults.push({
resultsById.set(toolUse.id, {
type: 'tool_result',
tool_use_id: toolUse.id,
content: `Error: tool not found: ${toolUse.name}`,
Expand Down Expand Up @@ -268,7 +289,7 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
allowed = decision === true || decision === 'always';
}
if (!allowed) {
toolResults.push({
resultsById.set(toolUse.id, {
type: 'tool_result',
tool_use_id: toolUse.id,
content: `Tool call blocked: ${verdict.reason}`,
Expand All @@ -287,12 +308,16 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
}
}

// Pre-execution snapshot (Edit/Write only)
if (
opts.enableSnapshots !== false &&
opts.session &&
(toolUse.name === 'Edit' || toolUse.name === 'Write')
) {
ready.push({ toolUse, handler });
}

// Runs one approved tool end-to-end: pre-snapshot, execute, PostToolUse
// hook, post-snapshot, event + result. Side-effect-free tools call this
// concurrently; mutating tools call it one at a time (see partition below).
const execOne = async ({ toolUse, handler }: Ready): Promise<void> => {
const isFileMutation = toolUse.name === 'Edit' || toolUse.name === 'Write';

if (opts.enableSnapshots !== false && opts.session && isFileMutation) {
const filePath = (toolUse.input as { file_path?: string }).file_path;
if (filePath) {
await opts.session.manager.snapshot({
Expand Down Expand Up @@ -327,13 +352,7 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
});
}

// Post-execution snapshot
if (
opts.enableSnapshots !== false &&
opts.session &&
(toolUse.name === 'Edit' || toolUse.name === 'Write') &&
!tr.isError
) {
if (opts.enableSnapshots !== false && opts.session && isFileMutation && !tr.isError) {
const filePath = (toolUse.input as { file_path?: string }).file_path;
if (filePath) {
await opts.session.manager.snapshot({
Expand All @@ -347,13 +366,26 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
}

opts.onEvent?.({ type: 'tool_result', id: toolUse.id, result: tr });
toolResults.push({
resultsById.set(toolUse.id, {
type: 'tool_result',
tool_use_id: toolUse.id,
content: tr.content,
is_error: tr.isError,
});
}
};

// Phase 2 — execute in the model's call order. Consecutive read-only calls
// don't touch snapshotSeq, so each such run executes concurrently as one
// batch. A mutating call first drains the pending read batch and then runs
// alone, so a read issued before a mutation never observes its effects and a
// read issued after one (e.g. Edit X followed by Read X) always does.
// Snapshot ordering stays deterministic because mutating tools never overlap.
let readBatch: Ready[] = [];
for (const r of ready) {
  if (READ_ONLY_TOOLS.has(r.toolUse.name)) {
    readBatch.push(r);
    continue;
  }
  // Finish every read queued ahead of this mutating call before it runs.
  await Promise.all(readBatch.map(execOne));
  readBatch = [];
  await execOne(r);
}
// Flush trailing reads (also covers the common all-reads turn).
await Promise.all(readBatch.map(execOne));

// Re-assemble in the model's original tool-call order.
const toolResults: ToolResultBlock[] = toolBlocks
.map((b) => resultsById.get(b.id))
.filter((r): r is ToolResultBlock => r !== undefined);

const resultMsg: StoredMessage = {
role: 'user',
Expand All @@ -363,12 +395,22 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
history.push(resultMsg);
if (opts.session) await opts.session.manager.append(opts.session.id, resultMsg);

// M3c: auto-compact if usage crossed threshold
// M3c: auto-compact if the *current* context crossed the threshold.
//
// Use this turn's usage (result.usage), NOT the cumulative totalUsage.
// `result.usage.inputTokens` is exactly the size of the history we just
// sent to the model, so it is the true current-context proxy. Cumulative
// usage is wrong on two counts: it sums every turn's input (each turn
// re-sends the whole history, so it inflates far past the real window and
// crosses the threshold too early), and it never shrinks after a compaction
// — meaning once over the line it would re-compact the already-compacted
// history on every subsequent turn. The next turn's inputTokens naturally
// reflects the freshly-compacted (smaller) context, so this self-corrects.
if (
opts.autoCompact &&
shouldCompact({
inputTokens: totalUsage.inputTokens,
outputTokens: totalUsage.outputTokens,
inputTokens: result.usage.inputTokens,
outputTokens: result.usage.outputTokens,
contextWindow: opts.autoCompact.contextWindow,
threshold: opts.autoCompact.threshold,
})
Expand Down
Loading
Loading