diff --git a/apps/cli/src/commands.test.ts b/apps/cli/src/commands.test.ts index 53e05a5..33acbce 100644 --- a/apps/cli/src/commands.test.ts +++ b/apps/cli/src/commands.test.ts @@ -28,7 +28,7 @@ function makeContext(overrides: Partial = {}): SessionContext { creds: { apiKey: 'sk-abcdefghij' }, sessionId: 'sess-xyz', sessions: new SessionManager({ root: '/tmp/x' }), - usage: { inputTokens: 100, outputTokens: 50, reasoningTokens: 0 }, + usage: { inputTokens: 100, outputTokens: 50, reasoningTokens: 0, cacheReadTokens: 0 }, ...overrides, }; } @@ -157,11 +157,21 @@ describe('built-in command behavior', () => { it('/cost computes pricing', async () => { const reg = new CommandRegistry(); const ctx = makeContext({ - usage: { inputTokens: 1_000_000, outputTokens: 500_000, reasoningTokens: 0 }, + usage: { + inputTokens: 1_000_000, + outputTokens: 500_000, + reasoningTokens: 0, + cacheReadTokens: 400_000, + }, }); const out = await reg.match('/cost')!.cmd.run([], ctx); - expect(out.join('\n')).toMatch(/Tokens/); - expect(out.join('\n')).toMatch(/Total/); + const text = out.join('\n'); + expect(text).toMatch(/Tokens/); + expect(text).toMatch(/Total/); + // Cache-aware: 400k of 1M input were cache hits → shown + credited. 
+ expect(text).toMatch(/cache hits: 400,000/); + expect(text).toMatch(/40%/); + expect(text).toMatch(/cache saved/i); }); it('/context shows window usage', async () => { diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts index 66b8106..3f45a59 100644 --- a/apps/cli/src/commands.ts +++ b/apps/cli/src/commands.ts @@ -13,6 +13,7 @@ import type { } from '@deepcode/core'; import { contextWindowFor, + estimateCost, redact, EFFORT_PARAMS, type Credentials, @@ -111,7 +112,12 @@ export interface SessionContext { credsStore?: CredentialsStore; sessionId: string; sessions: SessionManager; - usage: { inputTokens: number; outputTokens: number; reasoningTokens: number }; + usage: { + inputTokens: number; + outputTokens: number; + reasoningTokens: number; + cacheReadTokens: number; + }; /** Set true to terminate the REPL after this command. */ exitRequested?: boolean; /** Replace history entirely (used by /clear, /resume). */ @@ -304,20 +310,21 @@ export const CostCommand: SlashCommand = { aliases: ['/usage'], description: 'Show token usage and cost estimate.', run(_args, ctx) { - // Pricing per docs/design/effort-levels.md §2.4 - const inputYuan = (ctx.usage.inputTokens / 1_000_000) * 1.0; - const outputYuan = - ctx.model === 'deepseek-reasoner' - ? (ctx.usage.outputTokens / 1_000_000) * 16.0 - : (ctx.usage.outputTokens / 1_000_000) * 2.0; - const reasoningYuan = - ctx.model === 'deepseek-reasoner' ? (ctx.usage.reasoningTokens / 1_000_000) * 4.0 : 0; - const total = inputYuan + outputYuan + reasoningYuan; - return [ - `Tokens — in: ${ctx.usage.inputTokens.toLocaleString()}, out: ${ctx.usage.outputTokens.toLocaleString()}, reasoning: ${ctx.usage.reasoningTokens.toLocaleString()}`, - `Estimate — input: ¥${inputYuan.toFixed(4)}, output: ¥${outputYuan.toFixed(4)}, reasoning: ¥${reasoningYuan.toFixed(4)}`, - `Total this session: ¥${total.toFixed(4)}`, + // Cache-aware pricing per docs/design/effort-levels.md §2.4. 
DeepSeek's + // prompt caching is automatic server-side; cache-hit input tokens bill at + // ~10% of a miss, so a stable prompt prefix across turns saves real money. + const c = estimateCost(ctx.usage, ctx.model); + const cacheHits = Math.min(ctx.usage.cacheReadTokens, ctx.usage.inputTokens); + const hitPct = (c.cacheHitRate * 100).toFixed(0); + const lines = [ + `Tokens — in: ${ctx.usage.inputTokens.toLocaleString()} (cache hits: ${cacheHits.toLocaleString()}, ${hitPct}%), out: ${ctx.usage.outputTokens.toLocaleString()}, reasoning: ${ctx.usage.reasoningTokens.toLocaleString()}`, + `Estimate — input ¥${(c.cacheMissYuan + c.cacheHitYuan).toFixed(4)} (miss ¥${c.cacheMissYuan.toFixed(4)} + cache ¥${c.cacheHitYuan.toFixed(4)}), output ¥${c.outputYuan.toFixed(4)}, reasoning ¥${c.reasoningYuan.toFixed(4)}`, + `Total this session: ¥${c.totalYuan.toFixed(4)}`, ]; + if (c.cacheSavingsYuan > 0) { + lines.push(`Prompt cache saved ¥${c.cacheSavingsYuan.toFixed(4)} vs no caching.`); + } + return lines; }, }; diff --git a/apps/cli/src/parity-commands.test.ts b/apps/cli/src/parity-commands.test.ts index 6170ae3..ec6c948 100644 --- a/apps/cli/src/parity-commands.test.ts +++ b/apps/cli/src/parity-commands.test.ts @@ -30,7 +30,7 @@ function ctx(overrides: Partial = {}): SessionContext { creds: { apiKey: 'sk-test' }, sessionId: 's1', sessions: new SessionManager({ root: '/tmp/x' }), - usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0 }, + usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, ...overrides, }; } diff --git a/apps/cli/src/repl.ts b/apps/cli/src/repl.ts index cbe27d1..a00ab51 100644 --- a/apps/cli/src/repl.ts +++ b/apps/cli/src/repl.ts @@ -421,7 +421,7 @@ export async function startRepl(opts: ReplOpts): Promise { credsStore, sessionId: session.id, sessions, - usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0 }, + usage: { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }, mcpServers, mcpErrors, 
wiredPlugins: pluginsWire?.plugins.map((p) => ({ @@ -637,6 +637,7 @@ export async function startRepl(opts: ReplOpts): Promise { ctx.usage.inputTokens += result.usage.inputTokens; ctx.usage.outputTokens += result.usage.outputTokens; ctx.usage.reasoningTokens += result.usage.reasoningTokens; + ctx.usage.cacheReadTokens += result.usage.cacheReadTokens; // M3c-rest: honor ExitPlanMode tool signal — flip plan → default if (result.modeSignal?.exitPlanMode && ctx.mode === 'plan') { ctx.mode = 'default'; diff --git a/packages/core/src/agent.ts b/packages/core/src/agent.ts index a377cfe..bc1b02b 100644 --- a/packages/core/src/agent.ts +++ b/packages/core/src/agent.ts @@ -121,7 +121,12 @@ export interface RunAgentResult { /** Total provider round-trips executed. */ turnsUsed: number; /** Aggregate token usage. */ - usage: { inputTokens: number; outputTokens: number; reasoningTokens: number }; + usage: { + inputTokens: number; + outputTokens: number; + reasoningTokens: number; + cacheReadTokens: number; + }; /** Reason the loop terminated. */ stopReason: 'end_turn' | 'max_turns' | 'aborted' | 'error'; /** Mode-control signals flipped by tools during this run (M3c-rest). 
*/ @@ -380,7 +385,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { }); } - const totalUsage = { inputTokens: 0, outputTokens: 0, reasoningTokens: 0 }; + const totalUsage = { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, cacheReadTokens: 0 }; let turnsUsed = 0; // Stop hook — fires when the TOP-LEVEL agent finishes a run (a sub-agent's @@ -437,6 +442,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { totalUsage.inputTokens += result.usage.inputTokens; totalUsage.outputTokens += result.usage.outputTokens; totalUsage.reasoningTokens += result.usage.reasoningTokens; + totalUsage.cacheReadTokens += result.usage.cacheReadTokens; opts.onEvent?.({ type: 'usage', inputTokens: result.usage.inputTokens, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index b4aeea0..344b3d7 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -15,6 +15,8 @@ export { DEFAULT_CONTEXT_WINDOW, EFFORT_PARAMS, contextWindowFor, + estimateCost, + type CostBreakdown, type DeepSeekProviderOpts, type Provider, type ProviderResult, diff --git a/packages/core/src/providers/index.ts b/packages/core/src/providers/index.ts index fdfe50f..b712596 100644 --- a/packages/core/src/providers/index.ts +++ b/packages/core/src/providers/index.ts @@ -16,3 +16,5 @@ export { contextWindowFor, } from './deepseek.js'; export type { DeepSeekProviderOpts } from './deepseek.js'; +export { estimateCost } from './pricing.js'; +export type { CostBreakdown } from './pricing.js'; diff --git a/packages/core/src/providers/pricing.test.ts b/packages/core/src/providers/pricing.test.ts new file mode 100644 index 0000000..1c44ea6 --- /dev/null +++ b/packages/core/src/providers/pricing.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it } from 'vitest'; +import { estimateCost } from './pricing.js'; +import type { ProviderUsage } from './types.js'; + +function usage(p: Partial): ProviderUsage { + return { inputTokens: 0, outputTokens: 0, 
reasoningTokens: 0, cacheReadTokens: 0, ...p }; +} + +describe('estimateCost', () => { + it('prices deepseek-chat input + output with no cache hits', () => { + const c = estimateCost( + usage({ inputTokens: 1_000_000, outputTokens: 1_000_000 }), + 'deepseek-chat', + ); + expect(c.cacheMissYuan).toBeCloseTo(1.0, 6); + expect(c.cacheHitYuan).toBe(0); + expect(c.outputYuan).toBeCloseTo(2.0, 6); + expect(c.reasoningYuan).toBe(0); + expect(c.totalYuan).toBeCloseTo(3.0, 6); + expect(c.cacheHitRate).toBe(0); + expect(c.cacheSavingsYuan).toBe(0); + }); + + it('credits cache-hit tokens at 10% (inputTokens is inclusive of cache hits)', () => { + // 1M input of which 800k are cache hits → 200k miss @¥1/M + 800k hit @¥0.1/M. + const c = estimateCost( + usage({ inputTokens: 1_000_000, cacheReadTokens: 800_000 }), + 'deepseek-chat', + ); + expect(c.cacheMissYuan).toBeCloseTo(0.2, 6); // 200k @ ¥1/M + expect(c.cacheHitYuan).toBeCloseTo(0.08, 6); // 800k @ ¥0.1/M + expect(c.totalYuan).toBeCloseTo(0.28, 6); + expect(c.cacheHitRate).toBeCloseTo(0.8, 6); + expect(c.cacheSavingsYuan).toBeCloseTo(0.72, 6); // 800k @ (1.0−0.1)/M + }); + + it('prices reasoner output + reasoning higher', () => { + const c = estimateCost( + usage({ outputTokens: 1_000_000, reasoningTokens: 1_000_000 }), + 'deepseek-reasoner', + ); + expect(c.outputYuan).toBeCloseTo(16.0, 6); + expect(c.reasoningYuan).toBeCloseTo(4.0, 6); + }); + + it('clamps cache hits to input and falls back to chat pricing for unknown models', () => { + const c = estimateCost(usage({ inputTokens: 100, cacheReadTokens: 999 }), 'mystery-model'); + expect(c.cacheHitRate).toBe(1); // hits clamped to ≤ input + expect(c.cacheMissYuan).toBe(0); // all input was a cache hit + }); + + it('is zero for an empty session', () => { + const c = estimateCost(usage({}), 'deepseek-chat'); + expect(c.totalYuan).toBe(0); + expect(c.cacheHitRate).toBe(0); + }); +}); diff --git a/packages/core/src/providers/pricing.ts b/packages/core/src/providers/pricing.ts 
new file mode 100644
index 0000000..9f9d179
--- /dev/null
+++ b/packages/core/src/providers/pricing.ts
@@ -0,0 +1,113 @@
+// DeepSeek cost estimation (CNY), crediting server-side prompt caching.
+// Prices per docs/design/effort-levels.md §2.4 (per 1M tokens):
+//   model              input(miss)  cache-hit  output  reasoning
+//   deepseek-chat      ¥1           ¥0.1       ¥2      —
+//   deepseek-reasoner  ¥1           ¥0.1       ¥16     ¥4  (reasoning_content)
+//
+// DeepSeek's `prompt_tokens` (→ usage.inputTokens) is INCLUSIVE of the
+// cache-hit tokens (→ usage.cacheReadTokens), so cache-miss = input − cache-hit.
+// Cache hits bill at ~10% of a miss, so crediting them matters for long sessions
+// with a stable prompt prefix (the agent's system prompt + early turns).
+
+import type { ProviderUsage } from './types.js';
+
+/** Per-model unit prices, all in ¥ per 1M tokens. */
+interface ModelPricing {
+  /** Cache-miss prompt tokens, ¥ per 1M. */
+  inputMissPerM: number;
+  /** Cache-hit prompt tokens, ¥ per 1M. */
+  cacheHitPerM: number;
+  /** Completion tokens, ¥ per 1M. */
+  outputPerM: number;
+  /** reasoning_content tokens, ¥ per 1M (reasoner only). */
+  reasoningPerM: number;
+}
+
+/** Prices are quoted per this many tokens. */
+const TOKENS_PER_MILLION = 1_000_000;
+
+/** deepseek-chat prices; also the fallback for models without their own entry. */
+const CHAT_PRICING: ModelPricing = {
+  inputMissPerM: 1.0,
+  cacheHitPerM: 0.1,
+  outputPerM: 2.0,
+  reasoningPerM: 0,
+};
+
+// A Map rather than an object literal: `model` is an arbitrary string, and an
+// object lookup such as PRICING['toString'] would resolve to an inherited
+// Object.prototype member instead of falling through to the default.
+const PRICING: ReadonlyMap<string, ModelPricing> = new Map<string, ModelPricing>([
+  ['deepseek-chat', CHAT_PRICING],
+  [
+    'deepseek-reasoner',
+    { inputMissPerM: 1.0, cacheHitPerM: 0.1, outputPerM: 16.0, reasoningPerM: 4.0 },
+  ],
+]);
+
+export interface CostBreakdown {
+  /** Cache-miss input cost (¥). */
+  cacheMissYuan: number;
+  /** Cache-hit input cost (¥) — the discounted prompt-cache reads. */
+  cacheHitYuan: number;
+  outputYuan: number;
+  reasoningYuan: number;
+  totalYuan: number;
+  /** cacheReadTokens / inputTokens, 0..1 (0 when no input). */
+  cacheHitRate: number;
+  /** ¥ saved vs paying the full miss price for every input token. */
+  cacheSavingsYuan: number;
+}
+
+/**
+ * Normalize a token count for pricing: anything that is not a finite positive
+ * number (negative, NaN, ±Infinity, or a field missing at runtime) counts as 0,
+ * so one bad counter cannot turn the whole breakdown into NaN.
+ */
+function tokenCount(value: number): number {
+  return Number.isFinite(value) && value > 0 ? value : 0;
+}
+
+/** Cost in ¥ of `tokens` tokens at `pricePerM` ¥ per 1M tokens. */
+function yuan(tokens: number, pricePerM: number): number {
+  return (tokens / TOKENS_PER_MILLION) * pricePerM;
+}
+
+/**
+ * Estimate session cost in CNY from cumulative usage, crediting DeepSeek's
+ * cheaper cache-hit input tokens. Unknown models fall back to deepseek-chat
+ * pricing. Pure — safe to call anywhere.
+ *
+ * @param usage Cumulative token counters; `inputTokens` is treated as inclusive
+ *   of `cacheReadTokens`. Non-finite or negative counters are priced as 0.
+ * @param model Model id used to select the price row.
+ * @returns Per-category costs, their total, the cache hit rate, and the saving
+ *   relative to paying the miss price for every input token.
+ */
+export function estimateCost(usage: ProviderUsage, model: string): CostBreakdown {
+  const p = PRICING.get(model) ?? CHAT_PRICING;
+
+  const inputTokens = tokenCount(usage.inputTokens);
+  // Cache hits can never exceed the prompt they are part of.
+  const hitTokens = Math.min(tokenCount(usage.cacheReadTokens), inputTokens);
+  const missTokens = inputTokens - hitTokens;
+
+  const cacheMissYuan = yuan(missTokens, p.inputMissPerM);
+  const cacheHitYuan = yuan(hitTokens, p.cacheHitPerM);
+  const outputYuan = yuan(tokenCount(usage.outputTokens), p.outputPerM);
+  const reasoningYuan = yuan(tokenCount(usage.reasoningTokens), p.reasoningPerM);
+
+  // What those cache-hit tokens WOULD have cost at the miss price, minus what
+  // they actually cost — i.e. the prompt-cache discount.
+  const cacheSavingsYuan = yuan(hitTokens, p.inputMissPerM - p.cacheHitPerM);
+
+  return {
+    cacheMissYuan,
+    cacheHitYuan,
+    outputYuan,
+    reasoningYuan,
+    totalYuan: cacheMissYuan + cacheHitYuan + outputYuan + reasoningYuan,
+    cacheHitRate: inputTokens > 0 ? hitTokens / inputTokens : 0,
+    cacheSavingsYuan,
+  };
+}