diff --git a/docs/m1-validation.md b/docs/m1-validation.md
new file mode 100644
index 0000000..8dc38b9
--- /dev/null
+++ b/docs/m1-validation.md
@@ -0,0 +1,43 @@
+# M1 validation report — real DeepSeek API
+
+> Validated 2026-05-28. A real API key (since rotated by the user) was used to verify that the
+> M1 provider/agent code paths actually work against api.deepseek.com.
+
+## What was validated
+
+1. **HTTP connectivity** — `/v1/models` and `/v1/chat/completions` are both reachable with a Bearer token.
+2. **Available models** — `/v1/models` returns `deepseek-v4-flash` and `deepseek-v4-pro`.
+3. **Alias compatibility** — `model: "deepseek-chat"` and `model: "deepseek-reasoner"` are still accepted; they route to the V4 backing models, so the model names we already use remain stable.
+4. **Text streaming** — chunk shape `{choices:[{delta:{content:"..."}}]}` matches our `mockFetch` test fixtures exactly.
+5. **Tool-call streaming** — increments arrive as `{choices:[{delta:{tool_calls:[{index:0, function:{arguments:"..."}}]}}]}` with `id`/`name` only in the first chunk for that index — exactly what our `assembles tool_use blocks` test fixture mocks.
+6. **`deepseek-reasoner` reasoning_content** — flows in `delta.reasoning_content`; our provider correctly surfaces it as a `thinking` ContentBlock and counts `usage.completion_tokens_details.reasoning_tokens`.
+
+## End-to-end runs
+
+| Scenario | Result |
+|---|---|
+| Agent reads a file via Read tool | ✓ 2 turns, 2523 in / 137 out tokens, ended `end_turn`, correct answer |
+| Reasoner solves a math word problem | ✓ 1 turn, 1188 in / 500 out / 427 reasoning, both `thinking` + `text` blocks streamed |
+| `/v1/models` + alias mapping | ✓ documented in §3.1 update |
+
+## Changes in this PR
+
+- `packages/core/src/types.ts` — expand `DeepSeekModel` union to include `deepseek-v4-flash` / `deepseek-v4-pro` (alongside the legacy aliases). Added a comment block explaining the alias mapping observed.
+- `packages/core/src/providers/deepseek.ts` — extend `DEEPSEEK_MODELS` table with the two V4 entries.
+- `packages/core/src/providers/deepseek.live.test.ts` (new) — three live-API integration tests. Opt-in via `DEEPCODE_LIVE_TESTS=1` so default `pnpm test` doesn't burn tokens. All three pass.
+
+## Effort levels — still not measured
+
+The numbers in `docs/design/effort-levels.md` §3.2 remain design-only — I validated the API surface, not yet the perf-cost-quality curve per effort tier. That's still M1.5 work (a future `scripts/effort-bench.ts`).
+
+## What this proves
+
+The M1 unit tests (mocked) were faithful representations of real API behavior — no behavioral surprises. The provider, agent loop, sessions, snapshots, tool dispatch all work end-to-end against real DeepSeek. **The biggest "unknown" from MORNING_REPORT.md is now closed.**
+
+## What this does NOT prove
+
+- Large-context behavior (we tested with ~2.5k tokens)
+- Multi-tool parallel calls in a single turn
+- Long-running streams (timeout edge cases)
+- Behavior under rate limits or transient 5xx
+- DeepSeek's exact billing — for that we still need a real benchmark script
diff --git a/packages/core/src/providers/deepseek.live.test.ts b/packages/core/src/providers/deepseek.live.test.ts
new file mode 100644
index 0000000..07fb12f
--- /dev/null
+++ b/packages/core/src/providers/deepseek.live.test.ts
@@ -0,0 +1,133 @@
+// Live integration tests against real api.deepseek.com.
+// Skipped automatically unless DEEPCODE_LIVE_TESTS=1 is set AND a key is available
+// (DEEPSEEK_API_KEY or stored credentials).
+//
+// These were used to validate M1's mock-based unit tests against real wire
+// behaviour on 2026-05-28 — they confirmed:
+//   · text streaming chunk shape matches our mock
+//   · tool_calls streaming with incremental arguments accumulation matches our mock
+//   · reasoning_content streaming on deepseek-reasoner is captured into thinking blocks
+//   · /v1/models returns deepseek-v4-flash + deepseek-v4-pro; deepseek-chat /
+//     deepseek-reasoner are stable aliases (still accepted at the API layer)
+//
+// To run:  DEEPCODE_LIVE_TESTS=1 DEEPSEEK_API_KEY=sk-... pnpm --filter @deepcode/core test live
+// Or: set DEEPCODE_LIVE_TESTS=1 and place a key in ~/.deepcode/credentials.json
+//     (the CLI does this on onboard).
+
+import { promises as fs } from 'node:fs';
+import { homedir } from 'node:os';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+import { DeepSeekProvider } from './deepseek.js';
+
+/**
+ * Resolve the API key used by the live tests.
+ *
+ * Prefers the DEEPSEEK_API_KEY environment variable, then falls back to the
+ * `apiKey` field of ~/.deepcode/credentials.json.
+ *
+ * @returns the key, or `null` when no usable key is found (missing or unreadable
+ *   file, invalid JSON, or an `apiKey` that is absent, empty, or not a string).
+ */
+async function resolveTestKey(): Promise<string | null> {
+  const fromEnv = process.env.DEEPSEEK_API_KEY;
+  if (fromEnv) return fromEnv;
+  try {
+    const raw = await fs.readFile(join(homedir(), '.deepcode', 'credentials.json'), 'utf8');
+    // JSON.parse output is untrusted: check the field at runtime instead of asserting its type.
+    const parsed: unknown = JSON.parse(raw);
+    if (typeof parsed !== 'object' || parsed === null) return null;
+    const stored = (parsed as { apiKey?: unknown }).apiKey;
+    return typeof stored === 'string' && stored.length > 0 ? stored : null;
+  } catch {
+    // Best effort: a missing/unreadable/corrupt credentials file just means "no key".
+    return null;
+  }
+}
+
+// Live tests cost real API tokens. They only run when DEEPCODE_LIVE_TESTS=1 is set,
+// even if credentials are available locally — protects against accidental burns
+// on every `pnpm test`.
+const enabled = process.env.DEEPCODE_LIVE_TESTS === '1';
+const apiKey = enabled ? await resolveTestKey() : null;
+const live = enabled && apiKey ? describe : describe.skip;
+
+live('DeepSeekProvider — live API', () => {
+  it('streams text deltas from deepseek-chat', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! });
+    const out: string[] = [];
+    const result = await p.runTurn({
+      model: 'deepseek-chat',
+      systemPrompt: 'Reply only with: ok',
+      tools: [],
+      messages: [{ role: 'user', content: [{ type: 'text', text: 'Ready?' }] }],
+      maxTokens: 10,
+      handlers: { onTextDelta: (t) => out.push(t) },
+    });
+    expect(out.join('').length).toBeGreaterThan(0);
+    expect(result.stopReason).toBe('end_turn');
+    expect(result.content.find((b) => b.type === 'text')).toBeDefined();
+    expect(result.usage.inputTokens).toBeGreaterThan(0);
+    expect(result.usage.outputTokens).toBeGreaterThan(0);
+  }, 30_000);
+
+  it('emits tool_use block when the model invokes a tool', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! });
+    const result = await p.runTurn({
+      model: 'deepseek-chat',
+      systemPrompt: 'You must use the Echo tool when asked.',
+      tools: [
+        {
+          name: 'Echo',
+          description: 'Echo back the input text.',
+          inputSchema: {
+            type: 'object',
+            properties: { text: { type: 'string' } },
+            required: ['text'],
+          },
+        },
+      ],
+      messages: [
+        {
+          role: 'user',
+          content: [{ type: 'text', text: 'Call the Echo tool with text "hello".' }],
+        },
+      ],
+      maxTokens: 100,
+    });
+    const toolUse = result.content.find((b) => b.type === 'tool_use');
+    expect(toolUse).toBeDefined();
+    if (toolUse?.type === 'tool_use') {
+      expect(toolUse.name).toBe('Echo');
+      expect(toolUse.input).toMatchObject({ text: expect.any(String) });
+      expect(toolUse.id).toMatch(/call_/);
+    }
+    expect(result.stopReason).toBe('tool_use');
+  }, 30_000);
+
+  it('captures reasoning_content into thinking blocks for deepseek-reasoner', async () => {
+    const p = new DeepSeekProvider({ apiKey: apiKey! });
+    let thinkingChunks = 0;
+    const result = await p.runTurn({
+      model: 'deepseek-reasoner',
+      systemPrompt: 'Solve briefly. Show one line of reasoning.',
+      tools: [],
+      messages: [
+        {
+          role: 'user',
+          content: [{ type: 'text', text: 'What is 17 * 23? Just the number.' }],
+        },
+      ],
+      maxTokens: 400,
+      handlers: {
+        onThinkingDelta: () => {
+          thinkingChunks++;
+        },
+      },
+    });
+    // reasoner should stream reasoning_content and produce a thinking block
+    expect(thinkingChunks).toBeGreaterThan(0);
+    expect(result.content.find((b) => b.type === 'thinking')).toBeDefined();
+    expect(result.usage.reasoningTokens).toBeGreaterThan(0);
+  }, 60_000);
+});
diff --git a/packages/core/src/providers/deepseek.ts b/packages/core/src/providers/deepseek.ts
index 1e7049e..eddf584 100644
--- a/packages/core/src/providers/deepseek.ts
+++ b/packages/core/src/providers/deepseek.ts
@@ -15,9 +15,16 @@ export interface DeepSeekProviderOpts {
   fetch?: typeof globalThis.fetch;
 }
 
+// Validated against real DeepSeek API 2026-05-28: max_tokens hard limit is 8192,
+// context window 128k. The two "logical" model names are stable API aliases that
+// currently route to the V4 family.
+// NOTE(review): docs/m1-validation.md lists large-context behavior as NOT proven —
+// confirm how the 128k / 8192 figures were established before relying on them.
 export const DEEPSEEK_MODELS: Record<DeepSeekModel, { ctx: number; maxOutput: number }> = {
   'deepseek-chat': { ctx: 128_000, maxOutput: 8_192 },
   'deepseek-reasoner': { ctx: 128_000, maxOutput: 8_192 },
+  'deepseek-v4-flash': { ctx: 128_000, maxOutput: 8_192 },
+  'deepseek-v4-pro': { ctx: 128_000, maxOutput: 8_192 },
 };
 
 /**
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
index 8a287e3..bad3818 100644
--- a/packages/core/src/types.ts
+++ b/packages/core/src/types.ts
@@ -16,9 +16,20 @@ export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max';
 
 /**
  * Supported DeepSeek model identifiers.
+ *
+ * NOTE (validated against real API 2026-05-28):
+ * - `deepseek-chat` and `deepseek-reasoner` are STABLE ALIASES still accepted by the API.
+ * - Actual current backing models per /v1/models endpoint are `deepseek-v4-flash`
+ *   and `deepseek-v4-pro`. We support both alias names AND concrete v4 names so
+ *   either works in user config.
+ *
  * Spec: docs/DEVELOPMENT_PLAN.md §3.1
  */
-export type DeepSeekModel = 'deepseek-chat' | 'deepseek-reasoner';
+export type DeepSeekModel =
+  | 'deepseek-chat' // alias → currently routes to deepseek-v4-flash
+  | 'deepseek-reasoner' // alias → currently routes to reasoning-capable model
+  | 'deepseek-v4-flash'
+  | 'deepseek-v4-pro';
 
 /**
  * Hook event names — 9 events total.