diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 00000000..513a873e --- /dev/null +++ b/.eslintignore @@ -0,0 +1,3 @@ +# Stale agent worktrees produced by parallel Claude Code sessions — they +# hold their own branches and are linted as part of their own runs. +.claude/worktrees/ diff --git a/.oxlintrc.json b/.oxlintrc.json index 3e2ccf26..6158a462 100644 --- a/.oxlintrc.json +++ b/.oxlintrc.json @@ -28,6 +28,7 @@ "no-undef": "off", "no-underscore-dangle": "off", "no-useless-undefined": "off", + "require-unicode-regexp": "off", "no-warning-comments": "off", "prefer-destructuring": "off", "sort-imports": "off", diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index 7e300f45..152e3f98 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,10 +189,14 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'p90', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'), + selectedXAxisMode: 'interactivity' as const, + setSelectedXAxisMode: namedStub('setSelectedXAxisMode'), scaleType: 'auto', setScaleType: namedStub('setScaleType'), isLegendExpanded: true, diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx new file mode 100644 index 00000000..77f29805 --- /dev/null +++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx @@ -0,0 +1,17 @@ +import type { Metadata } from 'next'; + +import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail'; + +export const metadata: Metadata = { + title: 
'Agentic trace detail | InferenceX', + robots: { index: false }, +}; + +export default async function AgenticPointDetailPage({ + params, +}: { + params: Promise<{ id: string }>; +}) { + const { id } = await params; + return ; +} diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index a90e26fc..3d2d0da7 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -33,6 +33,10 @@ export function normalizeArtifactRows( if (!params) continue; const { config } = params; results.push({ + // Synthetic id — overlay rows aren't persisted, so trace_replay lookups + // (keyed on benchmark_results.id) will always miss, which is the + // intended behaviour: overlays never have stored trace_replay blobs. + id: 0, hardware: config.hardware, framework: config.framework, model: config.model, @@ -50,6 +54,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts new file mode 100644 index 00000000..63cb2dc0 --- /dev/null +++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts @@ -0,0 +1,64 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getAgenticAggregates, + type AgenticAggregateMap, +} from '@semianalysisai/inferencex-db/queries/agentic-aggregates'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: response stays small (a few numbers per id), but generating it +// parses ~5-10 MB of decompressed JSONL + JSON per id. 
Cache so the +// "Aggregates" toggle stays snappy. +const getCachedAgenticAggregates = cachedQuery( + (ids: number[]): Promise => getAgenticAggregates(getDb(), ids), + 'agentic-aggregates', + { blobOnly: true }, +); + +const MAX_IDS_PER_REQUEST = 200; + +/** + * GET /api/v1/agentic-aggregates?ids=1,2,3 + * + * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization, + * and prefix cache hit rate — computed live from the stored aiperf + * profile_export.jsonl + server_metrics_json blobs. Ids without a + * trace_replay blob (or with no usable samples) get nulls. + */ +export async function GET(request: NextRequest) { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > MAX_IDS_PER_REQUEST) { + return NextResponse.json( + { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` }, + { status: 400 }, + ); + } + + try { + const sorted = [...ids].toSorted((a, b) => a - b); + const result = await getCachedAgenticAggregates(sorted); + return cachedJson(result); + } catch (error) { + console.error('Error fetching agentic aggregates:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts new file mode 100644 index 00000000..14c1d461 --- /dev/null +++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts @@ -0,0 +1,38 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getBenchmarkSiblings, + type BenchmarkSiblings, +} from 
'@semianalysisai/inferencex-db/queries/benchmark-siblings'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedSiblings = cachedQuery( + (id: number): Promise => getBenchmarkSiblings(getDb(), id), + 'benchmark-siblings', +); + +/** + * GET /api/v1/benchmark-siblings?id=N + * + * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the + * benchmark_result + all sibling rows that share that SKU within the same + * workflow_run. Used by the agentic detail page to render a navigator. + */ +export async function GET(request: NextRequest) { + const id = Number(request.nextUrl.searchParams.get('id')); + if (!id || !Number.isFinite(id)) { + return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 }); + } + try { + const data = await getCachedSiblings(id); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching benchmark siblings:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts index 780f775e..92d5f326 100644 --- a/packages/app/src/app/api/v1/benchmarks/route.test.ts +++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts @@ -59,6 +59,7 @@ describe('GET /api/v1/benchmarks', () => { ['dsr1'], undefined, undefined, + undefined, ); }); @@ -72,6 +73,7 @@ describe('GET /api/v1/benchmarks', () => { ['dsr1'], '2026-03-01', undefined, + undefined, ); }); @@ -82,7 +84,27 @@ describe('GET /api/v1/benchmarks', () => { req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&exact=true'), ); expect(res.status).toBe(200); - expect(mockGetLatestBenchmarks).toHaveBeenCalledWith('mock-sql', ['dsr1'], '2026-03-01', true); + expect(mockGetLatestBenchmarks).toHaveBeenCalledWith( + 'mock-sql', + 
['dsr1'], + '2026-03-01', + true, + undefined, + ); + }); + + it('passes runId param to query when provided', async () => { + mockGetLatestBenchmarks.mockResolvedValueOnce([]); + + const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=26194160120')); + expect(res.status).toBe(200); + expect(mockGetLatestBenchmarks).toHaveBeenCalledWith( + 'mock-sql', + ['dsr1'], + undefined, + undefined, + '26194160120', + ); }); it('returns 500 when query throws', async () => { diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts index c79f1aa7..c4037208 100644 --- a/packages/app/src/app/api/v1/benchmarks/route.ts +++ b/packages/app/src/app/api/v1/benchmarks/route.ts @@ -11,10 +11,10 @@ import { loadFixture } from '@/lib/test-fixtures'; export const dynamic = 'force-dynamic'; const getCachedBenchmarks = cachedQuery( - (dbModelKeys: string[], date?: string, exact?: boolean) => { + (dbModelKeys: string[], date?: string, exact?: boolean, runId?: string) => { if (JSON_MODE) return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact)); - return getLatestBenchmarks(getDb(), dbModelKeys, date, exact); + return getLatestBenchmarks(getDb(), dbModelKeys, date, exact, runId); }, 'benchmarks', { blobOnly: true }, @@ -25,6 +25,7 @@ export async function GET(request: NextRequest) { const model = params.get('model') ?? ''; const date = params.get('date') ?? undefined; const exact = params.get('exact') === 'true'; + const runId = params.get('runId') ?? 
undefined; const dbModelKeys = DISPLAY_MODEL_TO_DB[model]; if (!dbModelKeys || dbModelKeys.length === 0) { return NextResponse.json({ error: 'Unknown model' }, { status: 400 }); @@ -32,7 +33,7 @@ export async function GET(request: NextRequest) { if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks')); try { - const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined); + const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId); return cachedJson(rows); } catch (error) { console.error('Error fetching benchmarks:', error); diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts new file mode 100644 index 00000000..6ce7c017 --- /dev/null +++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts @@ -0,0 +1,71 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getDerivedAgenticMetrics, + type DerivedAgenticMetricMap, +} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +// blobOnly: the response is one entry per id with two numbers, but the +// derivation work parses thousands of JSONL records per blob — cache the +// computed result so a chart-refresh hits the warm path. +// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user. +// Stale v1 cache entries return undefined for the new field and silently +// blank the chart with "No data available". 
+const getCachedDerivedAgenticMetrics = cachedQuery( + (ids: number[]): Promise => getDerivedAgenticMetrics(getDb(), ids), + 'derived-agentic-metrics-v2', + { blobOnly: true }, +); + +const MAX_IDS_PER_REQUEST = 200; + +/** + * GET /api/v1/derived-agentic-metrics?ids=1,2,3 + * + * Returns per-id derived metrics computed live from the stored aiperf + * profile_export.jsonl blobs: + * - normalized_session_time_s: mean across sessions of session e2e time + * (Σ per-turn request_latency) rescaled by mean_load / session_load. + * - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT) + * across every turn in every session. + * + * Ids without a trace_replay blob or with unparseable records are omitted. + */ +export async function GET(request: NextRequest) { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > MAX_IDS_PER_REQUEST) { + return NextResponse.json( + { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` }, + { status: 400 }, + ); + } + + try { + const sorted = [...ids].toSorted((a, b) => a - b); + const result = await getCachedDerivedAgenticMetrics(sorted); + return cachedJson(result); + } catch (error) { + console.error('Error fetching derived agentic metrics:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts new file mode 100644 index 00000000..6c884fb2 --- /dev/null +++ b/packages/app/src/app/api/v1/request-timeline/route.ts @@ -0,0 +1,40 @@ +import { type NextRequest, NextResponse } from 
'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getRequestTimeline, + type RequestTimeline, +} from '@semianalysisai/inferencex-db/queries/request-timeline'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedRequestTimeline = cachedQuery( + (id: number): Promise => getRequestTimeline(getDb(), id), + 'request-timeline', + { blobOnly: true }, +); + +/** + * GET /api/v1/request-timeline?id=N + * + * Returns the per-request Gantt timeline for one agentic benchmark point. + * Each request entry has ns-from-start offsets for credit/start/ack/end, + * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 404 if the + * point has no stored profile_export.jsonl blob. + */ +export async function GET(request: NextRequest) { + const id = Number(request.nextUrl.searchParams.get('id')); + if (!id || !Number.isFinite(id)) { + return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 }); + } + try { + const data = await getCachedRequestTimeline(id); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching request timeline:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts new file mode 100644 index 00000000..7a959a65 --- /dev/null +++ b/packages/app/src/app/api/v1/trace-histograms/route.ts @@ -0,0 +1,65 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceHistograms, + type TraceHistogramMap, +} from '@semianalysisai/inferencex-db/queries/trace-histograms'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 
'force-dynamic'; + +// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB +// unstable_cache limit (each point carries one int per request, ~500-1000+ +// requests for agentic), which manifests as a 500 from the route. Blob +// storage lets us cache the larger response without losing the warm-cache hit. +const getCachedTraceHistograms = cachedQuery( + (ids: number[]): Promise => getTraceHistograms(getDb(), ids), + 'trace-histograms', + { blobOnly: true }, +); + +const MAX_IDS_PER_REQUEST = 200; + +/** + * GET /api/v1/trace-histograms?ids=1,2,3 + * + * Returns per-request ISL/OSL arrays parsed from the stored aiperf + * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`. + * Ids without a trace_replay blob are omitted from the response. + */ +export async function GET(request: NextRequest) { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > MAX_IDS_PER_REQUEST) { + return NextResponse.json( + { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` }, + { status: 400 }, + ); + } + + try { + // Sort the cache key so the same set of ids in any order hits the same entry. 
+ const sorted = [...ids].toSorted((a, b) => a - b); + const histograms = await getCachedTraceHistograms(sorted); + return cachedJson(histograms); + } catch (error) { + console.error('Error fetching trace histograms:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts new file mode 100644 index 00000000..7346a3e8 --- /dev/null +++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts @@ -0,0 +1,40 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceServerMetrics, + type TraceServerMetrics, +} from '@semianalysisai/inferencex-db/queries/trace-server-metrics'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceServerMetrics = cachedQuery( + (id: number): Promise => getTraceServerMetrics(getDb(), id), + 'trace-server-metrics', + { blobOnly: true }, +); + +/** + * GET /api/v1/trace-server-metrics?id=N + * + * Returns parsed time-series for the agentic detail view: KV cache usage, + * prefix cache hit rate per interval, queue depth, and per-source prompt + * token rates. Times are in seconds from benchmark start. 404 if the point + * has no stored server_metrics_export.json blob. 
+ */ +export async function GET(request: NextRequest) { + const id = Number(request.nextUrl.searchParams.get('id')); + if (!id || !Number.isFinite(id)) { + return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 }); + } + try { + const data = await getCachedTraceServerMetrics(id); + if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 }); + return cachedJson(data); + } catch (error) { + console.error('Error fetching trace server metrics:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index d2a171ee..2dd40e0c 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -12,6 +12,8 @@ import { useState, } from 'react'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; + // useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway). const useIsomorphicLayoutEffect = typeof window === 'undefined' ? 
useEffect : useLayoutEffect; @@ -22,8 +24,6 @@ function isEnumValue>(e: T, v: string): v is T[ const RUNDATE_RE = /^\d{4}-\d{2}-\d{2}$/u; const RUNID_RE = /^[A-Za-z0-9_-]{1,64}$/u; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; - import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; import { useUrlState } from '@/hooks/useUrlState'; @@ -99,7 +99,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, @@ -146,7 +148,11 @@ export function GlobalFilterProvider({ const [selectedSequence, setSelectedSequence] = useState(() => { if (initialSequence) return initialSequence; - return Sequence.EightK_OneK; + const urlSeq = getUrlParam('i_seq'); + if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const [selectedPrecisions, setSelectedPrecisionsRaw] = useState(() => { @@ -267,9 +273,7 @@ export function GlobalFilterProvider({ if (!availabilityRows) { return unofficialSeqs.length > 0 ? [...new Set(unofficialSeqs)] : SEQUENCE_OPTIONS; } - const dbSeqs = modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null); + const dbSeqs = modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null); const merged = [...new Set([...dbSeqs, ...unofficialSeqs])]; return merged.length > 0 ? 
merged : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows, unofficialAvailable, selectedModel]); @@ -288,7 +292,7 @@ export function GlobalFilterProvider({ if (!availabilityRows) { return unofficialPrecs.length > 0 ? [...new Set(unofficialPrecs)].toSorted() : ['fp4']; } - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const dbPrecs = rows.map((r) => r.precision); const merged = [...new Set([...dbPrecs, ...unofficialPrecs])].toSorted(); return merged.length > 0 ? merged : ['fp4']; @@ -304,7 +308,7 @@ export function GlobalFilterProvider({ // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 17ce37b5..2e5a245f 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import { FAVORITE_PRESETS, @@ -43,7 +43,7 @@ import { import { useUrlState } from '@/hooks/useUrlState'; import { buildAvailabilityHwKey } from '@/lib/chart-utils'; import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants'; -import { 
hasMtpEngineExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings'; +import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings'; import { MtpEngineConflictToast, type MtpEngineConflictDetail, @@ -128,10 +128,51 @@ export function InferenceProvider({ () => getUrlParam('i_metric') || 'y_tpPerGpu', ); const [selectedXAxisMetric, setSelectedXAxisMetric] = useState( - () => getUrlParam('i_xmetric') || 'p99_ttft', + () => getUrlParam('i_xmetric') || 'p90_ttft', ); const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState( - () => getUrlParam('i_e2e_xmetric') || null, + () => getUrlParam('i_e2e_xmetric') || 'p90_ttft', + ); + // Selected chart variant. Initialize from URL only — SSR cannot read URL, so + // computing a kind-based default here would diverge between server and client + // and cause a hydration mismatch. The scenario-kind default is applied in a + // post-mount effect below (and a ref tracks whether the user has overridden). + type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; + const VALID_X_MODES: XAxisMode[] = [ + 'ttft', + 'e2e', + 'interactivity', + 'session-time', + 'prefill-tps', + ]; + // SSR has no URL access, so seed with a fixed default and apply the URL + // value (if any) in a post-mount effect — keeps server + client first render + // identical and avoids "didn't match" hydration warnings when the URL holds + // a non-default mode. 
+ const [selectedXAxisMode, setSelectedXAxisMode] = useState('ttft'); + const xAxisModeFromUrlRef = useRef(false); + useEffect(() => { + if (xAxisModeFromUrlRef.current) return; + const v = getUrlParam('i_xmode'); + if (v && (VALID_X_MODES as string[]).includes(v)) { + xAxisModeFromUrlRef.current = true; + setSelectedXAxisMode(v as XAxisMode); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the + // existing useChartData pipeline keys off that flag for the e2e chart's x-axis. + const handleSetXAxisMode = useCallback((mode: XAxisMode) => { + xAxisModeFromUrlRef.current = true; + setSelectedXAxisMode(mode); + // The e2e chart's x-axis metric is reconciled in a separate effect below, + // because it depends on sequence kind (fixed-seq has no p90_* metrics) and + // the agentic percentile, both of which can change independently. + }, []); + // Latency percentile applied to the chart x-axis for agentic scenarios. + // Values: 'p90' | 'p99'. Non-agentic charts ignore. + const [selectedPercentile, setSelectedPercentile] = useState( + () => getUrlParam('i_pctl') || 'p90', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', @@ -188,6 +229,39 @@ export function InferenceProvider({ // ── Data fetching (gated by isActive) ────────────────────────────────────── const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined; + // Run-selector scoping: only constrain benchmark data to a specific run when + // there's actually a disambiguation to make for the CURRENT model. The + // raw `availableRuns` is across ALL models on the date, so the picker may + // auto-select a run that produced nothing for the current model — passing + // that runId would return zero rows and hide the chart entirely. 
+ // Compute the set of runs whose CHANGELOG explicitly mentions this model + + // precision. We can't reuse `filterRunsByModel` here because it has a + // fallback that returns all runs when nothing matches (so the picker still + // renders) — which would make us pass a runId that produced no rows for + // the current model, hiding the chart. + const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING) + .filter(([, model]) => model === selectedModel) + .map(([prefix]) => prefix); + const runIdsWithModelChangelog: string[] = []; + if (availableRuns) { + for (const [runId, runInfo] of Object.entries(availableRuns)) { + if (!runInfo.changelog) continue; + const matches = runInfo.changelog.entries.some((entry) => + entry.config_keys.some((key) => { + const parts = key.split('-'); + return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!); + }), + ); + if (matches) runIdsWithModelChangelog.push(runId); + } + } + const benchmarkRunId = + selectedRunId && + runIdsWithModelChangelog.length > 1 && + runIdsWithModelChangelog.includes(selectedRunId) + ? String(selectedRunId) + : undefined; + const { graphs, loading: chartDataLoading, @@ -208,7 +282,9 @@ export function InferenceProvider({ effectiveRunDate, isActive, latestDate, + selectedPercentile, compareGpuPair ?? 
null, + benchmarkRunId, ); // For GPU comparison date picker — use shared availability data from global filters @@ -222,7 +298,7 @@ export function InferenceProvider({ if (!availabilityRows) return availableDates; const rows = availabilityRows.filter((r) => { if (!dbModelKeys.includes(r.model)) return false; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false; + if (rowToSequence(r) !== effectiveSequence) return false; if (!effectivePrecisions.includes(r.precision)) return false; if (!r.hardware) return false; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -247,7 +323,7 @@ export function InferenceProvider({ const hwKeys = new Set(); for (const r of availabilityRows) { if (!dbModelKeys.includes(r.model)) continue; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue; + if (rowToSequence(r) !== effectiveSequence) continue; if (!effectivePrecisions.includes(r.precision)) continue; if (!r.hardware) continue; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -319,6 +395,61 @@ export function InferenceProvider({ setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev)); }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]); + // Reconcile the x-axis mode with the scenario kind: + // - On mount with no `i_xmode` URL param: snap to the kind's natural default + // (agentic → ttft, fixed → interactivity). The state itself was initialized + // to a SSR-stable constant so server and client render the same DOM; this + // effect fixes it up after hydration. + // - When the user later switches sequence kinds: snap to the new kind's + // natural default (the prior selection was for a different kind, so it + // doesn't carry over). 
+ const lastSeqKindRef = useRef | null>(null); + useEffect(() => { + const kind = sequenceKind(effectiveSequence); + const isInitialMount = lastSeqKindRef.current === null; + const isAgenticOnlyMode = + selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps'; + // On a stale render where kind hasn't changed, bail unless the current + // mode is agentic-only and we just landed on a fixed-seq scenario — in + // that case force the snap so the chart doesn't try to plot trace-derived + // metrics against rows that have no trace_replay. + if (!isInitialMount && lastSeqKindRef.current === kind) { + if (kind === 'fixed-seq' && isAgenticOnlyMode) { + handleSetXAxisMode('interactivity'); + } + return; + } + lastSeqKindRef.current = kind; + if ( + isInitialMount && + xAxisModeFromUrlRef.current && + !(kind === 'fixed-seq' && isAgenticOnlyMode) + ) { + // URL-restored agentic-only mode on a fixed-seq sequence makes no sense + // — fall through to the default snap below. + return; + } + handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity'); + }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]); + + // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or + // agentic percentile changes. For fixed-seq the JSONB only carries + // median_* / p99_* (no p90_*), so the TTFT button there has to point at + // median_ttft — otherwise the chart goes blank. For agentic, we point at + // the user's chosen percentile so the dropdown actually drives the axis. + useEffect(() => { + const isAgentic = sequenceKind(effectiveSequence) === 'agentic'; + if (selectedXAxisMode === 'ttft') { + setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft'); + } else if (selectedXAxisMode === 'e2e') { + // null = use the chart-config natural x (median_e2el), which useChartData + // rewrites to _e2el for agentic via withPercentile(). 
+ setSelectedE2eXAxisMetric(null); + } + // 'interactivity' mode renders the interactivity chart, which keys off + // selectedXAxisMetric (not the e2e one), so nothing to do here. + }, [selectedXAxisMode, effectiveSequence, selectedPercentile]); + // Ref guard: when true, filter changes don't clear the active preset. // FavoritePresetsDropdown sets this while applying a preset so its own // programmatic setter calls don't accidentally deactivate it. @@ -768,6 +899,7 @@ export function InferenceProvider({ useUrlStateSync( { i_metric: selectedYAxisMetric, + i_pctl: selectedPercentile, i_gpus: selectedGPUs.join(','), i_dates: selectedDates.join(','), i_dstart: selectedDateRange.startDate, @@ -778,6 +910,7 @@ export function InferenceProvider({ i_log: logScale ? '1' : '', i_xmetric: selectedXAxisMetric || '', i_e2e_xmetric: selectedE2eXAxisMetric || '', + i_xmode: selectedXAxisMode, i_scale: scaleType, i_legend: isLegendExpanded ? '' : '0', i_advlabel: useAdvancedLabels ? '1' : '', @@ -791,6 +924,7 @@ export function InferenceProvider({ selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, + selectedXAxisMode, scaleType, selectedGPUs, selectedDates, @@ -961,6 +1095,8 @@ export function InferenceProvider({ setSelectedXAxisMetric, selectedE2eXAxisMetric, setSelectedE2eXAxisMetric, + selectedXAxisMode, + setSelectedXAxisMode: handleSetXAxisMode, scaleType, setScaleType, loading, @@ -968,6 +1104,8 @@ export function InferenceProvider({ workflowInfo, selectedYAxisMetric, setSelectedYAxisMetric: setSelectedYAxisMetricAndClear, + selectedPercentile, + setSelectedPercentile, selectedGPUs, setSelectedGPUs: setSelectedGPUsAndClear, availableGPUs, @@ -1032,6 +1170,7 @@ export function InferenceProvider({ selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, + selectedXAxisMode, scaleType, selectedGPUs, selectedDates, diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx 
b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx new file mode 100644 index 00000000..2e43b4fb --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -0,0 +1,488 @@ +'use client'; + +import Link from 'next/link'; +import { useRouter } from 'next/navigation'; +import { useState } from 'react'; +import { ArrowLeft } from 'lucide-react'; + +import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates'; +import { useRequestTimeline } from '@/hooks/api/use-request-timeline'; +import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; +import { + useTraceServerMetrics, + type PointMeta, + type QueueDepthPoint, + type TimeSeriesPoint, +} from '@/hooks/api/use-trace-server-metrics'; +import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; + +import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; +import { Distribution } from './distribution'; +import { ExpandableChart } from './expandable-chart'; +import { RequestTimelineView } from './request-timeline'; +import { SiblingNav, chipLabel } from './sibling-nav'; +import { + StackedAreaChart, + TimeSeriesChart, + cumulativeAverage, + rollingAverage, + sumSeries, +} from './time-series-chart'; + +interface Props { + id: number; +} + +const fmtPct = (v: number | null | undefined): string => + v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`; + +function MetaLine({ label, value }: { label: string; value: React.ReactNode }) { + return ( +
+ {label} + {value} +
+ ); +} + +function PointSummary({ meta }: { meta: PointMeta }) { + return ( +
+
+

+ Selected point + {meta.disagg ? ' · disagg' : ''} + {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''} +

+ {meta.run_url && ( + + GitHub Actions run → + + )} +
+
+ + + + + {meta.isl !== null && } + {meta.osl !== null && } +
+
+ ); +} + +/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */ +const CHART_SIZES = { + inline: { width: 720, height: 260 }, + expanded: { width: 1300, height: 520 }, +}; + +type DetailView = 'point' | 'timeline' | 'aggregates'; +const VIEW_OPTIONS: SegmentedToggleOption[] = [ + { value: 'point', label: 'Per-point', testId: 'detail-view-point' }, + { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' }, + { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' }, +]; + +/** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */ +function toAggPoint( + sibling: { id: number; label: string }, + pct: { mean: number; p50: number; p75: number; p90: number; p99: number } | null | undefined, +): AggregatePoint { + const values: Partial> = {}; + if (pct) { + values.mean = pct.mean; + values.p50 = pct.p50; + values.p75 = pct.p75; + values.p90 = pct.p90; + values.p99 = pct.p99; + } + return { id: sibling.id, label: sibling.label, values }; +} + +export function AgenticPointDetail({ id }: Props) { + const router = useRouter(); + const histQuery = useTraceHistograms([id], true); + const metricsQuery = useTraceServerMetrics(id, true); + const siblingsQuery = useBenchmarkSiblings(id); + + const hist = histQuery.data?.[id]; + const metrics = metricsQuery.data; + const siblingsData = siblingsQuery.data; + + const [view, setView] = useState('point'); + // Fetch aggregates only when the aggregates view is active. Uses the full + // sibling set (across parallelism + concurrency configs) so each chart + // shows how the metric varies across the SKU. + const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; + const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); + // Per-request timeline fetched only when the timeline view is active. + const timelineQuery = useRequestTimeline(id, view === 'timeline'); + + return ( +
+
+ + · + + Inference chart + +
+ + {siblingsData ? ( + + ) : siblingsQuery.isLoading ? ( +
Loading SKU navigator…
+ ) : null} + + {metrics ? ( + + ) : metricsQuery.isLoading ? ( +
Loading point metadata…
+ ) : null} + + {metricsQuery.isError && ( +
+ Failed to load trace data for benchmark point #{id}. +
+ )} + {metricsQuery.data === null && !metricsQuery.isLoading && ( +
+ No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf + time-series capture, or its source artifacts have expired on GitHub. +
+ )} + +
+ + {view === 'aggregates' && ( + + {siblingIds.length} configs in SKU + {aggregatesQuery.isLoading ? ' · loading…' : ''} + + )} + {view === 'timeline' && timelineQuery.data && ( + + {timelineQuery.data.requests.length} requests + + )} +
+ + {view === 'aggregates' ? ( + + ) : view === 'timeline' ? ( + timelineQuery.isLoading ? ( +
+ Loading request timeline… +
+ ) : timelineQuery.data ? ( + + ) : ( +
+ No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact + isn't stored for this row. +
+ ) + ) : ( +
+ { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (hist) return ; + return histQuery.isLoading ? : ; + }} + /> + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (hist) return ; + return histQuery.isLoading ? : ; + }} + /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="KV cache (%)" + {...size} + /> + ); + }} + /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + ({ + t: p.t, + value: p.running, + })), + 50, + ), + color: '#22c55e', + strokeWidth: 2, + }, + { + name: 'Waiting (avg n=50)', + data: rollingAverage( + metrics.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.waiting, + })), + 50, + ), + color: '#ef4444', + strokeWidth: 2, + }, + { + name: 'Total (avg n=50)', + data: rollingAverage( + metrics.queueDepth.map((p: QueueDepthPoint) => ({ + t: p.t, + value: p.total, + })), + 50, + ), + color: '#3b82f6', + strokeWidth: 2, + }, + ]} + durationS={metrics.durationS} + yAxisLabel="Requests" + {...size} + /> + ); + }} + /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + `${(v * 100).toFixed(0)}%`} + yAxisLabel="Hit rate (%)" + {...size} + /> + ); + }} + /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + const total = sumSeries(metrics.prefillTps, metrics.decodeTps); + return ( + + ); + }} + /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + + ); + }} + /> +
+ )} +
+ ); +} + +function AggregatesGrid({ + siblings, + aggregates, + isLoading, +}: { + siblings: { + id: number; + conc: number; + decode_tp: number; + decode_ep: number; + disagg: boolean; + num_prefill_gpu: number; + num_decode_gpu: number; + offload_mode?: string | null; + }[]; + aggregates: AgenticAggregateMap | undefined; + isLoading: boolean; +}) { + if (siblings.length === 0) { + return ( +
+ SKU sibling list not loaded yet — open a point to populate. +
+ ); + } + if (isLoading && !aggregates) { + return ( +
+ Computing aggregates across {siblings.length} configs… (parsing trace blobs) +
+ ); + } + const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s as any) })); + const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl)); + const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl)); + const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil)); + const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate)); + return ( +
+ ( + + )} + /> + ( + + )} + /> + ( + `${(v * 100).toFixed(0)}%`} + {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)} + /> + )} + /> + ( + `${(v * 100).toFixed(0)}%`} + {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)} + /> + )} + /> +
+ ); +} + +function Skeleton() { + return
; +} + +function Empty() { + return ( +
No data
+ ); +} + +// Re-export type for use by sub-components +export type { TimeSeriesPoint, QueueDepthPoint }; diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx new file mode 100644 index 00000000..55ac8061 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx @@ -0,0 +1,286 @@ +'use client'; + +import { useMemo } from 'react'; + +import { ChartHover, type HoverItem } from './chart-hover'; + +export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99'; + +interface PercentileLine { + key: PercentileKey; + /** Display label in legend / tooltip. */ + label: string; + color: string; +} + +const PERCENTILE_LINES: PercentileLine[] = [ + { key: 'mean', label: 'Mean', color: '#ef4444' }, + { key: 'p50', label: 'P50', color: '#3b82f6' }, + { key: 'p75', label: 'P75', color: '#22c55e' }, + { key: 'p90', label: 'P90', color: '#f59e0b' }, + { key: 'p99', label: 'P99', color: '#a855f7' }, +]; + +export interface AggregatePoint { + /** Sibling label rendered on x-axis (e.g. "TP8 • c=8"). */ + label: string; + /** Per-percentile value; missing percentiles are dropped from the plot. */ + values: Partial>; + /** Sibling id — purely informational, used in the tooltip title. */ + id?: number; +} + +/** + * Multi-line chart: one x-position per sibling config, one line per + * percentile (mean/p50/p75/p90/p99). Designed for the "Aggregates across + * configs" view on the agentic detail page. + */ +export function AggregateChart({ + points, + unit, + yMax, + yFmt, + width = 720, + height = 320, +}: { + points: readonly AggregatePoint[]; + unit: string; + /** Optional fixed y-axis upper bound (e.g. 1 for percentages). */ + yMax?: number; + /** Optional value formatter (e.g. percentage → "30%"). 
*/ + yFmt?: (v: number) => string; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + const PAD = { top: 16, right: 16, bottom: 90, left: 64 }; + const fmt = (v: number) => + yFmt + ? yFmt(v) + : v >= 10000 + ? new Intl.NumberFormat('en-US').format(Math.round(v)) + : v.toFixed(v < 10 ? 2 : 0); + + const computed = useMemo(() => { + if (points.length === 0) return null; + let yMaxComputed = 0; + for (const p of points) { + for (const line of PERCENTILE_LINES) { + const v = p.values[line.key]; + if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v; + } + } + const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + return { yTop, innerW, innerH }; + }, [points, W, H, PAD.left, PAD.right, PAD.top, PAD.bottom, yMax]); + + if (!computed) { + return ( +
+ No data +
+ ); + } + const { yTop, innerW, innerH } = computed; + + // X positions: evenly spaced across the inner width. + const xOf = (i: number) => + points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW; + const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH; + + // 5 y-axis ticks evenly between 0 and yTop. + const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4); + + // Resolve hover: snap to nearest sibling index and emit all percentiles + // that have data at that x. + const resolve = (fraction: number) => { + const idx = Math.round(fraction * (points.length - 1)); + const p = points[Math.max(0, Math.min(points.length - 1, idx))]; + if (!p) return null; + const items: HoverItem[] = []; + for (const line of PERCENTILE_LINES) { + const v = p.values[line.key]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + items.push({ color: line.color, label: line.label, value: fmt(v) }); + } + return { items, title: p.label }; + }; + + return ( +
+
+ {PERCENTILE_LINES.map((line) => ( +
+ + {line.label} +
+ ))} + + {points.length} configs · units: {unit} + +
+ + {/* y-axis ticks + gridlines */} + {yTicks.map((v, i) => { + const y = yOf(v); + return ( + + + + {fmt(v)} + + + ); + })} + + {/* X-axis tick labels — one per sibling, rotated 30° to fit. */} + {points.map((p, i) => { + const x = xOf(i); + return ( + + + + {p.label} + + + ); + })} + + {/* X axis baseline */} + + + {/* Horizontal connecting lines per percentile — faint backdrop so the + eye can follow how each percentile changes across configs. */} + {PERCENTILE_LINES.map((line) => { + const segments: { x1: number; y1: number; x2: number; y2: number }[] = []; + let prev: { x: number; y: number } | null = null; + for (let i = 0; i < points.length; i++) { + const v = points[i]!.values[line.key]; + if (typeof v !== 'number' || !Number.isFinite(v)) { + prev = null; + continue; + } + const x = xOf(i); + const y = yOf(v); + if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y }); + prev = { x, y }; + } + return ( + + {segments.map((s, j) => ( + + ))} + + ); + })} + + {/* Per-sibling vertical bar spanning the percentile range, with a + colored tick at each percentile level. Mean rendered as a small + diamond to distinguish from the percentile ticks. */} + {points.map((p, i) => { + const x = xOf(i); + // Collect percentile values present for this sibling. + const present = PERCENTILE_LINES.filter( + (line) => + typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!), + ).map((line) => ({ ...line, value: p.values[line.key]! })); + if (present.length === 0) return null; + // Only the *percentile* values define the bar extent; mean might be + // outside the percentile span on weird distributions. + const pctlOnly = present.filter((p2) => p2.key !== 'mean'); + const bandValues = pctlOnly.length > 0 ? 
pctlOnly : present; + const bandYs = bandValues.map((b) => yOf(b.value)); + const yLo = Math.min(...bandYs); + const yHi = Math.max(...bandYs); + return ( + + + {present.map((b) => { + const ty = yOf(b.value); + if (b.key === 'mean') { + // Diamond marker for mean. + const s = 4; + return ( + + ); + } + // Horizontal tick at each percentile. + return ( + + ); + })} + + ); + })} + +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx new file mode 100644 index 00000000..24270122 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx @@ -0,0 +1,148 @@ +'use client'; + +import { useState, type ReactNode } from 'react'; + +/** Vertical crosshair + floating value tooltip overlay shared by every chart. */ +export interface HoverItem { + /** Color swatch to render next to the label. */ + color: string; + label: string; + value: string; + /** Optional faint secondary line (e.g. timestamp under main values). */ + hint?: string; +} + +interface ChartHoverProps { + /** Padding inside the SVG; matches the chart's CHART_PAD. */ + pad: { top: number; right: number; bottom: number; left: number }; + /** SVG viewBox dimensions used to render the chart. */ + width: number; + height: number; + /** + * Called with the cursor's normalized x in [0..1] across the plot area. + * Returns `null` to hide the tooltip (e.g. cursor outside data range). + */ + resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null; + children: ReactNode; +} + +/** + * Wrap a chart's render to add mouse-driven crosshair + tooltip. + * + * The chart owner renders its bars / lines / axes via `children`; this wrapper + * adds an invisible across the plot area to capture pointer events, a + * vertical line that follows the cursor, and a floating tooltip on the right + * of the cursor (auto-flipping to the left when it would overflow). 
+ */ +export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) { + const [hover, setHover] = useState<{ + xPx: number; + yPx: number; + fraction: number; + items: HoverItem[]; + title?: string; + } | null>(null); + + const innerW = width - pad.left - pad.right; + const innerH = height - pad.top - pad.bottom; + + const onMove = (e: React.MouseEvent) => { + const svg = e.currentTarget.ownerSVGElement; + if (!svg) return; + const rect = svg.getBoundingClientRect(); + // Convert client coords → SVG viewBox coords. + const sx = ((e.clientX - rect.left) * width) / rect.width; + const sy = ((e.clientY - rect.top) * height) / rect.height; + const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW)); + const resolved = resolve(fraction); + if (!resolved) { + setHover(null); + return; + } + setHover({ xPx: sx, yPx: sy, fraction, items: resolved.items, title: resolved.title }); + }; + + const onLeave = () => setHover(null); + + return ( +
+ + {children} + {hover && ( + + )} + + + {hover && hover.items.length > 0 && ( + + )} +
+ ); +} + +function HoverTooltip({ + xFraction, + containerWidth, + padLeft, + innerW, + title, + items, +}: { + xFraction: number; + containerWidth: number; + padLeft: number; + innerW: number; + title?: string; + items: HoverItem[]; +}) { + // Position tooltip near the crosshair as a % of the container. + // We flip to the cursor's left side when it would overflow the right edge. + const xPx = padLeft + xFraction * innerW; + const onRight = xPx < containerWidth * 0.55; + const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto'; + const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`; + return ( +
+ {title &&
{title}
} + {items.map((it, i) => ( +
+ + {it.label} + {it.value} +
+ ))} +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx new file mode 100644 index 00000000..685b73f3 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/distribution.tsx @@ -0,0 +1,242 @@ +'use client'; + +import { useMemo } from 'react'; + +import { ChartHover, type HoverItem } from './chart-hover'; + +const fmtNum = (n: number) => + n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n)); + +/** + * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the + * detail-page card — fills its container width via `viewBox` + 100% width. + * Hover shows the bin range + count + cumulative percentile. + */ +export function Distribution({ + values, + unit, + width = 720, + height = 260, +}: { + values: readonly number[]; + unit: string; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + const PAD = { top: 12, right: 16, bottom: 56, left: 60 }; + + const computed = useMemo(() => { + if (values.length === 0) return null; + const sorted = [...values].toSorted((a, b) => a - b); + const min = sorted[0]!; + const max = sorted.at(-1)!; + const range = Math.max(1e-9, max - min); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length)))); + const counts: number[] = Array.from({ length: nBins }, () => 0); + for (const v of values) { + const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins)); + counts[i]!++; + } + return { sorted, min, max, range, innerW, innerH, nBins, counts }; + }, [values, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]); + + if (!computed) { + return ( +
No data
+ ); + } + const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed; + const maxCount = Math.max(...counts, 1); + const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW; + const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH; + const barW = innerW / nBins; + + const fmt = fmtNum; + + const quantile = (q: number): number => { + const pos = (sorted.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo); + }; + + const GUIDES = [ + { label: 'p50', q: 0.5, color: '#3b82f6' }, + { label: 'p75', q: 0.75, color: '#22c55e' }, + { label: 'p90', q: 0.9, color: '#f59e0b' }, + { label: 'p95', q: 0.95, color: '#ef4444' }, + ] as const; + + // Hover: report the bin range under cursor, its count, and what percentile + // the bin's midpoint represents in the empirical distribution. + const resolve = (fraction: number) => { + const v = min + fraction * range; + const binIdx = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins)); + const binLo = min + (binIdx * range) / nBins; + const binHi = min + ((binIdx + 1) * range) / nBins; + const count = counts[binIdx] ?? 0; + // Cumulative % at the bin's right edge. + let cumCount = 0; + for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0; + const cumPct = (cumCount / values.length) * 100; + const items: HoverItem[] = [ + { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` }, + { color: 'currentColor', label: 'Count', value: count.toLocaleString() }, + { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` }, + ]; + return { items }; + }; + + const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max]; + const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4); + + return ( +
+
+ {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit} +
+ + {/* y-axis gridlines + labels */} + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {fmt(v)} + + + ); + })} + + {/* Bars */} + {counts.map((c, i) => { + const h = (c / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* Percentile guide lines */} + {GUIDES.map(({ q, color }) => { + const v = quantile(q); + const x = xScale(v); + return ( + + ); + })} + + {/* X axis */} + + {xTickVals.map((v, i) => { + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle'; + return ( + + {fmt(v)} + + ); + })} + + value ({unit}) + + + count + + + {/* Percentile legend chips */} + {(() => { + const chipY = H - 8; + const chipW = innerW / GUIDES.length; + return GUIDES.map(({ label: ql, q, color }, i) => { + const v = quantile(q); + const x = PAD.left + i * chipW; + return ( + + + + {ql} {fmt(v)} + + + ); + }); + })()} + +
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx new file mode 100644 index 00000000..7c8e4538 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx @@ -0,0 +1,46 @@ +'use client'; + +import { useState, type ReactNode } from 'react'; +import { Maximize2 } from 'lucide-react'; + +import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog'; + +/** + * Wraps a chart in a card with a header + expand button. Click the button to + * open the chart in a large dialog. The `render` prop receives `expanded:true` + * inside the dialog so charts can pick larger width/height. + */ +export function ExpandableChart({ + title, + render, +}: { + title: string; + render: (expanded: boolean) => ReactNode; +}) { + const [open, setOpen] = useState(false); + + return ( +
+
+

{title}

+ +
+ {render(false)} + + + + {title} + +
{render(true)}
+
+
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx new file mode 100644 index 00000000..bcbe105a --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -0,0 +1,821 @@ +'use client'; + +import { useCallback, useMemo, useRef, useState } from 'react'; + +import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline'; +import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; + +/** + * Gantt-style request timeline for one agentic benchmark point. + * + * Rows are conversations (or workers — toggle in the header). Bars are + * individual HTTP requests, drawn from request_start to request_end with a + * thin lead-in segment from credit_issued (load gen queue). Scroll-wheel + * zooms, drag pans, hover shows per-request stats. + * + * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy. + */ + +type RowMode = 'conversation' | 'worker'; + +const ROW_MODE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' }, + { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' }, +]; + +type PhaseFilter = 'all' | 'profiling'; + +const PHASE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' }, + { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' }, +]; + +/** A stable color palette indexed by row-key hash. */ +const ROW_COLORS = [ + '#3b82f6', + '#ef4444', + '#10b981', + '#f59e0b', + '#a855f7', + '#06b6d4', + '#f97316', + '#84cc16', + '#ec4899', + '#14b8a6', + '#8b5cf6', + '#eab308', +]; + +/** Phase color overlay drawn as a thin strip at the bottom of each bar. 
*/ +const PHASE_COLORS: Record = { + profiling: '#22c55e', + warmup: '#94a3b8', + unknown: '#64748b', +}; + +interface Row { + key: string; + label: string; + color: string; + requests: RequestRecord[]; + /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */ + depth: number; + /** True if this row is a sub-agent ("Subagent N of parent X"). */ + isSubagent: boolean; +} + +/** + * Conversation ids for subagent calls look like + * ::sa:subagent__ + * Split into the parent cid and a sub-agent label (or the whole thing if + * this is a top-level conversation). + */ +function splitCid(cid: string): { parent: string; subagent: string | null } { + const sep = cid.indexOf('::sa:'); + if (sep === -1) return { parent: cid, subagent: null }; + return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) }; +} + +/** Group requests into rows; in conversation mode subagents nest under parents. */ +function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { + const groups = new Map(); + for (const r of requests) { + const key = mode === 'conversation' ? r.cid : r.wid; + let list = groups.get(key); + if (!list) { + list = []; + groups.set(key, list); + } + list.push(r); + } + + if (mode !== 'conversation') { + // Worker mode: flat rows, sorted by first activity. + const rows: Row[] = []; + let i = 0; + for (const [key, list] of groups) { + list.sort((a, b) => a.start - b.start); + rows.push({ + key, + label: shortenWid(key), + color: ROW_COLORS[i % ROW_COLORS.length]!, + requests: list, + depth: 0, + isSubagent: false, + }); + i++; + } + rows.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start); + return rows; + } + + // Conversation mode: build a parent → [subagents] tree so each parent + // group renders as one parent row followed by its sub-agent rows. Color + // is shared inside a tree so the visual grouping reads. 
+ interface Tree { + parentCid: string; + parentRow: { key: string; requests: RequestRecord[] } | null; + subagents: Map; // subagent label → requests + firstStart: number; + } + const trees = new Map(); + for (const [cid, list] of groups) { + list.sort((a, b) => a.start - b.start); + const { parent, subagent } = splitCid(cid); + let tree = trees.get(parent); + if (!tree) { + tree = { + parentCid: parent, + parentRow: null, + subagents: new Map(), + firstStart: Number.POSITIVE_INFINITY, + }; + trees.set(parent, tree); + } + if (subagent === null) { + tree.parentRow = { key: cid, requests: list }; + } else { + tree.subagents.set(subagent, list); + } + const earliest = list[0]!.start; + if (earliest < tree.firstStart) tree.firstStart = earliest; + } + + const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); + const rows: Row[] = []; + let colorIdx = 0; + for (const tree of sortedTrees) { + const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; + colorIdx++; + if (tree.parentRow) { + rows.push({ + key: tree.parentRow.key, + label: shortenCid(tree.parentCid), + color, + requests: tree.parentRow.requests, + depth: 0, + isSubagent: false, + }); + } else { + // Pseudo-parent header so orphan subagents still render under + // something they belong to. + rows.push({ + key: `__parent_${tree.parentCid}`, + label: shortenCid(tree.parentCid), + color, + requests: [], + depth: 0, + isSubagent: false, + }); + } + const subagentEntries = [...tree.subagents.entries()].toSorted( + (a, b) => a[1][0]!.start - b[1][0]!.start, + ); + for (const [saLabel, list] of subagentEntries) { + rows.push({ + key: `${tree.parentCid}::${saLabel}`, + label: `↳ ${formatSubagentLabel(saLabel)}`, + color, + requests: list, + depth: 1, + isSubagent: true, + }); + } + } + return rows; +} + +/** `subagent_001_bf1c5c16` → `subagent 001 · bf1c` (compact, readable). 
*/ +function formatSubagentLabel(raw: string): string { + const m = /^subagent_(\d+)_([0-9a-f]+)$/i.exec(raw); + if (!m) return raw; + return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; +} + +function shortenCid(cid: string): string { + if (cid.length <= 12) return cid; + return `${cid.slice(0, 8)}…${cid.slice(-4)}`; +} + +function shortenWid(wid: string): string { + // worker_4ae87bea → w_4ae8 + return wid.replace(/^worker_/, 'w_').slice(0, 12); +} + +/** Format ns offset → "+12.3s" / "+1.2m". */ +function formatTickLabel(ns: number): string { + const ms = ns / 1e6; + if (ms < 1000) return `+${ms.toFixed(0)}ms`; + if (ms < 60_000) return `+${(ms / 1000).toFixed(ms < 10_000 ? 1 : 0)}s`; + return `+${(ms / 60_000).toFixed(1)}m`; +} + +function formatDuration(ms: number): string { + if (ms < 1000) return `${ms.toFixed(0)}ms`; + if (ms < 60_000) return `${(ms / 1000).toFixed(2)}s`; + return `${(ms / 60_000).toFixed(2)}m`; +} + +/** Number of values in a sorted ascending array that are <= target. */ +function countLeq(sorted: number[], target: number): number { + let lo = 0; + let hi = sorted.length; + while (lo < hi) { + const mid = (lo + hi) >>> 1; + if (sorted[mid]! <= target) lo = mid + 1; + else hi = mid; + } + return lo; +} +/** Number of values in a sorted ascending array that are < target. */ +function countLt(sorted: number[], target: number): number { + let lo = 0; + let hi = sorted.length; + while (lo < hi) { + const mid = (lo + hi) >>> 1; + if (sorted[mid]! < target) lo = mid + 1; + else hi = mid; + } + return lo; +} + +interface TooltipData { + x: number; + y: number; + row: Row; + req: RequestRecord; +} + +function Tooltip({ data }: { data: TooltipData }) { + const { row, req } = data; + const totalMs = (req.end - req.start) / 1e6; + const queueMs = (req.start - req.credit) / 1e6; + return ( +
+
+ + {row.label} + · turn {req.ti} + {req.cancelled && · cancelled} +
+
+ Total + {formatDuration(totalMs)} + Queue wait + + {queueMs > 0.5 ? formatDuration(queueMs) : '—'} + + {req.ttftMs !== null && ( + <> + TTFT + + {formatDuration(req.ttftMs)} + + + )} + {req.isl !== null && ( + <> + ISL + + {req.isl.toLocaleString()} + + + )} + {req.osl !== null && ( + <> + OSL + + {req.osl.toLocaleString()} + + + )} + Phase + {req.phase} + {req.ad > 0 && ( + <> + Agent depth + {req.ad} + + )} + Worker + {shortenWid(req.wid)} +
+
+ Started at {formatTickLabel(req.start)} +
+
+ ); +} + +export function RequestTimelineView({ data }: { data: RequestTimeline }) { + const [rowMode, setRowMode] = useState('conversation'); + const [phaseFilter, setPhaseFilter] = useState('profiling'); + const [tooltip, setTooltip] = useState(null); + const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); + + // Apply phase filter, then group into rows. + const filtered = useMemo( + () => + phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'), + [data.requests, phaseFilter], + ); + const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]); + + // Pre-sort the timestamp columns so the cursor-time stats popover can + // count "running / waiting at time t" in O(log n). With a few hundred + // requests this is overkill — but it stays smooth on huge runs too. + const sortedTimes = useMemo(() => { + const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b); + const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b); + const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b); + return { credits, starts, ends }; + }, [filtered]); + + // Cursor state (vertical line + stats popover). null when the mouse + // isn't over the chart. xPx is svg-local; tNs is the ns offset from + // dataStart that the cursor is pointing at. + const [cursor, setCursor] = useState<{ + xPx: number; + tNs: number; + clientX: number; + clientY: number; + } | null>(null); + + // Timeline extent (clamped to actual data — if we filtered out warmup + // the visible window should shrink to just the profiling phase). + const dataStart = filtered.length === 0 ? 0 : Math.min(...filtered.map((r) => r.credit)); + const dataEnd = filtered.length === 0 ? 1 : Math.max(...filtered.map((r) => r.end)); + const totalNs = Math.max(dataEnd - dataStart, 1); + + // Visible window state (ns offsets, relative to dataStart). 
+ const [viewStart, setViewStart] = useState(0); + const [viewEnd, setViewEnd] = useState(null); + const vStart = viewStart; + const vEnd = viewEnd ?? totalNs; + const visibleDur = Math.max(vEnd - vStart, 1); + const isZoomed = viewEnd !== null; + + // Layout + const LABEL_WIDTH = 160; + const ROW_HEIGHT = 22; + const ROW_GAP = 3; + const HEADER_HEIGHT = 24; + const PADDING_RIGHT = 12; + const chartWidth = 920; + const svgHeight = HEADER_HEIGHT + rows.length * (ROW_HEIGHT + ROW_GAP) + 6; + const scale = (chartWidth - PADDING_RIGHT) / visibleDur; + // Local coords: convert ns offset from dataStart to x px. + const xOf = (ns: number) => (ns - dataStart - vStart) * scale; + + // Time-axis ticks (~8 across visible window, snapped to nice second multiples). + const niceMs = [ + 100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000, + ]; + const targetMs = visibleDur / 1e6 / 8; + const tickMs = niceMs.find((n) => n >= targetMs) ?? targetMs; + const tickNs = tickMs * 1e6; + const ticks: number[] = []; + const tickStart = Math.floor(vStart / tickNs) * tickNs; + for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) { + if (t >= vStart && t <= vEnd) ticks.push(t); + } + + const handleWheel = useCallback( + (e: React.WheelEvent) => { + e.preventDefault(); + const rect = e.currentTarget.getBoundingClientRect(); + const mouseX = e.clientX - rect.left; + const mouseRatio = Math.max(0, Math.min(1, mouseX / (chartWidth - PADDING_RIGHT))); + const curStart = vStart; + const curEnd = vEnd; + const curDur = curEnd - curStart; + const factor = e.deltaY > 0 ? 
1.2 : 1 / 1.2; + const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs); + const pivot = curStart + mouseRatio * curDur; + let newStart = pivot - mouseRatio * newDur; + let newEnd = pivot + (1 - mouseRatio) * newDur; + if (newStart < 0) { + newEnd -= newStart; + newStart = 0; + } + if (newEnd > totalNs) { + newStart -= newEnd - totalNs; + newEnd = totalNs; + if (newStart < 0) newStart = 0; + } + if (newEnd - newStart >= totalNs * 0.99) { + setViewStart(0); + setViewEnd(null); + } else { + setViewStart(newStart); + setViewEnd(newEnd); + } + }, + [vStart, vEnd, totalNs, chartWidth], + ); + + const handleMouseDown = useCallback( + (e: React.MouseEvent) => { + if (e.button !== 0) return; + dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd }; + }, + [vStart, vEnd], + ); + + const handleMouseMove = useCallback( + (e: React.MouseEvent) => { + // Dragging takes precedence over cursor tracking — panning the view. + if (dragRef.current) { + const dx = e.clientX - dragRef.current.startX; + const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT); + const delta = -dx * nsPerPx; + let ns = dragRef.current.vs + delta; + let ne = dragRef.current.ve + delta; + const dur = ne - ns; + if (ns < 0) { + ns = 0; + ne = dur; + } + if (ne > totalNs) { + ne = totalNs; + ns = totalNs - dur; + if (ns < 0) ns = 0; + } + setViewStart(ns); + setViewEnd(ne); + setTooltip(null); + setCursor(null); + return; + } + // Track the cursor position in svg-local px and the matching ns offset + // so the crosshair + stats popover can render. Clamped to the chart + // plot area (don't show a cursor on the axis labels gutter). 
+ const rect = e.currentTarget.getBoundingClientRect(); + const xPx = Math.max(0, Math.min(chartWidth - PADDING_RIGHT, e.clientX - rect.left)); + const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT); + const tNs = vStart + xPx * nsPerPx; + setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY }); + }, + [visibleDur, chartWidth, totalNs, vStart], + ); + + const handleMouseUp = useCallback(() => { + dragRef.current = null; + }, []); + + const handleMouseLeave = useCallback(() => { + dragRef.current = null; + setCursor(null); + }, []); + + const resetZoom = useCallback(() => { + setViewStart(0); + setViewEnd(null); + }, []); + + if (rows.length === 0) { + return ( +
+ No requests in the current filter. +
+ ); + } + + const totalRequests = filtered.length; + + return ( +
+ {/* Controls */} +
+ + + + {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '} + {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '} + {formatDuration((dataEnd - dataStart) / 1e6)} + {isZoomed && ( + <> + {' · '} + + + )} + +
+ + {/* Chart container */} +
+
+ {/* Label column — sticky, doesn't scroll horizontally with the chart. */} +
+
+ + {rowMode === 'conversation' ? 'Conversation' : 'Worker'} + +
+ {rows.map((row) => ( +
+ + + {row.label} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
+ ))} +
+ + {/* Scrollable SVG */} +
+ + {/* Header / time-axis baseline */} + + + {/* Time axis ticks */} + {ticks.map((t) => { + // Convert visible-window ns offset → x px (the tick array + // is already in dataStart-relative coords). + const x = (t - vStart) * scale; + return ( + + + + {formatTickLabel(t)} + + + ); + })} + + {/* Row separators */} + {rows.map((row, idx) => ( + + ))} + + {/* Request bars */} + {rows.map((row, rowIdx) => { + const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; + const barH = ROW_HEIGHT - 4; + return row.requests.map((req) => { + const xCredit = xOf(req.credit); + const xStart = xOf(req.start); + const xEnd = xOf(req.end); + // Cull bars entirely outside the visible window so big + // benchmarks don't render thousands of zero-width rects. + if (xEnd < -2 || xCredit > chartWidth + 2) return null; + const runW = Math.max(xEnd - xStart, 1); + const queueW = Math.max(xStart - xCredit, 0); + const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + return ( + setTooltip({ x: e.clientX, y: e.clientY, row, req })} + onMouseLeave={() => setTooltip(null)} + > + {/* Queue lead-in (faint) — only drawn when noticeable. */} + {queueW >= 1 && ( + + )} + {/* Main bar */} + + {/* Phase strip at bottom */} + + {/* Cancelled X overlay */} + {req.cancelled && runW > 6 && ( + + )} + + ); + }); + })} + + {/* Cursor crosshair — drawn on top of bars so it stays visible + through dense rows. Stats popover is rendered as fixed + HTML below the SVG block. */} + {cursor && ( + + )} + +
+
+
+ + {/* Footer / legend */} +
+ + + queue wait + + + + profiling + + + + warmup + + scroll to zoom · drag to pan +
+ + {/* Cursor stats popover: count of in-flight / waiting at the cursor's + ns offset. Hidden when the user is hovering an individual bar + (per-request tooltip wins). */} + {cursor && !tooltip && ( + + )} + + {/* Tooltip */} + {tooltip && } +
+ ); +} + +function CursorPopover({ + cursor, + dataStart, + startTimes, + endTimes, + creditTimes, +}: { + cursor: { xPx: number; tNs: number; clientX: number; clientY: number }; + dataStart: number; + startTimes: number[]; + endTimes: number[]; + creditTimes: number[]; +}) { + // At time t (ns from dataStart, here represented as t = tNs): + // running = #(start <= t) - #(end < t) + // waiting = #(credit <= t) - #(start <= t) + // completed= #(end <= t) + const t = cursor.tNs; + const startsLeq = countLeq(startTimes, t); + const endsLt = countLt(endTimes, t); + const creditsLeq = countLeq(creditTimes, t); + const endsLeq = countLeq(endTimes, t); + const running = Math.max(0, startsLeq - endsLt); + const waiting = Math.max(0, creditsLeq - startsLeq); + const completed = endsLeq; + const inflight = running + waiting; + // Absolute wall-clock seconds since the timeline origin (dataStart). + const tSec = t / 1e9; + // Position the popover near the cursor without overflowing the viewport. + // 200 px wide; flip to the left of the cursor if it would clip the right. + const wantLeft = cursor.clientX + 14; + const left = + typeof window === 'undefined' || wantLeft + 220 < window.innerWidth + ? wantLeft + : cursor.clientX - 220; + return ( +
+
+ t = + + {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`} + +
+
+ In flight + {inflight} + running + {running} + waiting + {waiting} + Completed + {completed} +
+ {/* dataStart is informational — the displayed t is relative to it. */} +
+ relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock) +
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx new file mode 100644 index 00000000..aa727fdc --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -0,0 +1,118 @@ +'use client'; + +import { useRouter } from 'next/navigation'; +import { ChevronLeft, ChevronRight } from 'lucide-react'; + +import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; + +const HW_LABELS: Record = { + b200: 'B200', + b300: 'B300', + gb200: 'GB200', + gb300: 'GB300', + h100: 'H100', + h200: 'H200', + mi300x: 'MI300X', + mi325x: 'MI325X', + mi355x: 'MI355X', +}; + +const MODEL_LABELS: Record = { + dsr1: 'DeepSeek R1', + dsv4: 'DeepSeek V4 Pro', + glm5: 'GLM-5', + 'glm5.1': 'GLM-5.1', + gptoss120b: 'gpt-oss 120B', + kimik2: 'Kimi K2', + 'kimik2.5': 'Kimi K2.5', + 'kimik2.6': 'Kimi K2.6', + llama70b: 'Llama 3.3 70B', + 'minimaxm2.5': 'MiniMax M2.5', + 'minimaxm2.7': 'MiniMax M2.7', + 'qwen3.5': 'Qwen 3.5', +}; + +function hwLabel(hw: string) { + return HW_LABELS[hw] ?? hw.toUpperCase(); +} +function modelLabel(m: string) { + return MODEL_LABELS[m] ?? m; +} +function frameworkLabel(fw: string) { + if (fw === 'vllm') return 'vLLM'; + if (fw === 'sglang') return 'SGLang'; + if (fw === 'trt') return 'TRT'; + if (fw === 'mori-sglang') return 'Mori-SGLang'; + if (fw.startsWith('dynamo-')) return `Dynamo ${fw.slice('dynamo-'.length).toUpperCase()}`; + return fw; +} + +/** Short label for a sibling chip: parallelism + concurrency. */ +export function chipLabel(s: BenchmarkSibling): string { + const parallel = s.disagg + ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` + : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`; + const offload = s.offload_mode === 'on' ? 
' • off=ON' : ''; + return `${parallel} • c=${s.conc}${offload}`; +} + +export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) { + const router = useRouter(); + const currentIdx = siblings.findIndex((s) => s.is_current); + const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null; + const next = + currentIdx !== -1 && currentIdx < siblings.length - 1 ? siblings[currentIdx + 1] : null; + + const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`; + + return ( +
+
+

{skuLabel}

+ + {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date} + +
+
+ +
+ {siblings.map((s) => { + const active = s.is_current; + return ( + + ); + })} +
+ +
+
+ ); +} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx new file mode 100644 index 00000000..cd10aff7 --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -0,0 +1,496 @@ +'use client'; + +import { useMemo } from 'react'; + +import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics'; + +import { ChartHover, type HoverItem } from './chart-hover'; + +interface Series { + name: string; + /** The line to draw (caller pre-smooths if desired). */ + data: TimeSeriesPoint[]; + /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */ + rawData?: TimeSeriesPoint[]; + color: string; + /** Override default stroke width (1.8). Use higher values for emphasis lines. */ + strokeWidth?: number; +} + +interface TimeSeriesChartProps { + series: Series[]; + durationS: number; + yMax?: number; + yFmt?: (v: number) => string; + yAxisLabel?: string; + width?: number; + height?: number; +} + +/** Centered rolling average over `windowSize` samples. */ +export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] { + if (data.length === 0 || windowSize <= 1) return data; + const half = Math.floor(windowSize / 2); + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const start = Math.max(0, i - half); + const end = Math.min(data.length, i + half + 1); + let sum = 0; + let n = 0; + for (let j = start; j < end; j++) { + sum += data[j]!.value; + n++; + } + out[i] = { t: data[i]!.t, value: n > 0 ? sum / n : 0 }; + } + return out; +} + +/** Expanding-window cumulative mean from index 0..i. 
*/ +export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + let sum = 0; + for (let i = 0; i < data.length; i++) { + sum += data[i]!.value; + out[i] = { t: data[i]!.t, value: sum / (i + 1) }; + } + return out; +} + +/** Pointwise sum of two arrays sharing the same t index. */ +export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { + const n = Math.min(a.length, b.length); + const out: TimeSeriesPoint[] = Array.from({ length: n }); + for (let i = 0; i < n; i++) { + out[i] = { t: a[i]!.t, value: a[i]!.value + b[i]!.value }; + } + return out; +} + +const fmtIntDefault = (n: number) => + n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n)); + +const fmtSeconds = (s: number) => { + if (s < 60) return `${Math.round(s)}s`; + const m = Math.floor(s / 60); + const rem = Math.round(s % 60); + return `${m}m ${rem}s`; +}; + +/** Linear-interpolated value at time `t` from a time-sorted series. 
*/ +function interpAt(data: TimeSeriesPoint[], t: number): number | null { + if (data.length === 0) return null; + if (t <= data[0]!.t) return data[0]!.value; + if (t >= data.at(-1)!.t) return data.at(-1)!.value; + // Binary search + let lo = 0; + let hi = data.length - 1; + while (hi - lo > 1) { + const mid = (lo + hi) >> 1; + if (data[mid]!.t <= t) lo = mid; + else hi = mid; + } + const a = data[lo]!; + const b = data[hi]!; + if (b.t === a.t) return a.value; + const frac = (t - a.t) / (b.t - a.t); + return a.value + (b.value - a.value) * frac; +} + +export function TimeSeriesChart({ + series, + durationS, + yMax: yMaxOpt, + yFmt = fmtIntDefault, + yAxisLabel, + width = 720, + height = 260, +}: TimeSeriesChartProps) { + const W = width; + const H = height; + const PAD = { top: 12, right: 16, bottom: 56, left: 60 }; + + const layout = useMemo(() => { + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const xMax = Math.max(durationS, 1); + const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value))); + const xScale = (t: number) => PAD.left + (t / xMax) * innerW; + const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH; + return { innerW, innerH, xMax, yMax, xScale, yScale }; + }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]); + + const { innerW, innerH, xMax, yMax, xScale, yScale } = layout; + + const subsample = (arr: TimeSeriesPoint[]) => { + if (arr.length === 0) return arr; + const stride = Math.max(1, Math.floor(arr.length / innerW)); + return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr; + }; + + // Pre-format axis ticks. 
+ const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4); + const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4); + + const resolve = (fraction: number) => { + const t = fraction * xMax; + const items: HoverItem[] = []; + for (const s of series) { + const v = interpAt(s.data, t); + if (v === null || !Number.isFinite(v)) continue; + items.push({ color: s.color, label: s.name, value: yFmt(v) }); + } + if (items.length === 0) return null; + return { items, title: fmtSeconds(t) }; + }; + + if (series.every((s) => s.data.length === 0)) { + return ( +
No data
+ ); + } + + return ( + + {/* y-axis gridlines + labels */} + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {yFmt(v)} + + + ); + })} + + {/* Raw scatter underlay */} + {series + .filter((s) => s.rawData && s.rawData.length > 0) + .map((s, si) => + subsample(s.rawData!).map((d, i) => ( + + )), + )} + + {/* Lines */} + {series.map((s, si) => { + if (s.data.length === 0) return null; + const sampled = subsample(s.data); + const path = sampled + .map( + (d, i) => + `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`, + ) + .join(' '); + return ( + + ); + })} + + {/* X-axis */} + + {xTickVals.map((v, i) => { + const x = xScale(v); + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle'; + return ( + + {fmtSeconds(v)} + + ); + })} + + time + + + {yAxisLabel && ( + + {yAxisLabel} + + )} + + {/* Legend */} + {(() => { + const chipY = H - 8; + const chipW = innerW / Math.max(1, series.length); + return series.map((s, i) => { + const x = PAD.left + i * chipW; + return ( + + + + {s.name} + + + ); + }); + })()} + + ); +} + +/** Stacked-area chart for token-source share over time. 
*/ +export function StackedAreaChart({ + sourceSeries, + durationS, + width = 720, + height = 260, +}: { + sourceSeries: Record; + durationS: number; + width?: number; + height?: number; +}) { + const W = width; + const H = height; + const PAD = { top: 12, right: 16, bottom: 56, left: 60 }; + + const computed = useMemo(() => { + const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); + if (entries.length === 0) return null; + const tValues = entries[0]![1].map((p) => p.t); + const cum: Record = {}; + for (const [name, arr] of entries) { + let acc = 0; + cum[name] = arr.map((p) => { + acc += p.value; + return acc; + }); + } + const shares: Record = {}; + for (const name of Object.keys(cum)) shares[name] = []; + for (let i = 0; i < tValues.length; i++) { + const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0); + for (const [name] of entries) { + shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0); + } + } + return { tValues, shares }; + }, [sourceSeries]); + + const colors: Record = { + local_compute: '#f97316', + local_cache_hit: '#3b82f6', + external_kv_transfer: '#22c55e', + miss: '#f97316', + }; + const labelFor: Record = { + local_compute: 'Prefill', + local_cache_hit: 'HBM Cache Hit', + external_kv_transfer: 'Offload Cache Hit', + miss: 'Miss', + }; + + if (!computed) { + return ( +
No data
+ ); + } + const { tValues, shares } = computed; + + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const xMax = Math.max(durationS, 1); + const xScale = (t: number) => PAD.left + (t / xMax) * innerW; + const yScale = (v: number) => PAD.top + (1 - v) * innerH; + + const stackOrder = Object.keys(shares); + const lower: number[] = Array.from({ length: tValues.length }, () => 0); + const layers = stackOrder.map((name) => { + const upper = shares[name]!.map((v, i) => lower[i]! + v); + const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]); + const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]); + const d = `${top + .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`) + .join(' ')} ${[...bottom] + .toReversed() + .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`) + .join(' ')} Z`; + const color = colors[name] ?? '#6b7280'; + for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; + return { name, color, d }; + }); + + const resolve = (fraction: number) => { + const t = fraction * xMax; + // Find the closest tValue index. + let idx = 0; + let bestDist = Infinity; + for (let i = 0; i < tValues.length; i++) { + const d = Math.abs(tValues[i]! - t); + if (d < bestDist) { + bestDist = d; + idx = i; + } + } + const items: HoverItem[] = stackOrder.map((name) => ({ + color: colors[name] ?? '#6b7280', + label: labelFor[name] ?? name, + value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`, + })); + return { items, title: fmtSeconds(t) }; + }; + + const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4); + const yTickVals = [0, 0.25, 0.5, 0.75, 1]; + + return ( + + {yTickVals.map((v, i) => { + const y = yScale(v); + return ( + + + + {(v * 100).toFixed(0)}% + + + ); + })} + {layers.map((l, i) => ( + + ))} + + {xTickVals.map((v, i) => { + const x = xScale(v); + const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 
'end' : 'middle'; + return ( + + {fmtSeconds(v)} + + ); + })} + + time + + + % of prefill tokens + + {(() => { + const chipY = H - 8; + const chipW = innerW / Math.max(1, layers.length); + return layers.map((l, i) => { + const x = PAD.left + i * chipW; + return ( + + + + {labelFor[l.name] ?? l.name} + + + ); + }); + })()} + + ); +} diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index beed5e0a..328750f0 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -1,7 +1,7 @@ import { useMemo, useRef } from 'react'; import { useQueries } from '@tanstack/react-query'; -import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; +import { rowToSequence } from '@semianalysisai/inferencex-constants'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { @@ -19,8 +19,8 @@ import { getModelSortIndex, hardwareKeyMatchesAnyBase, } from '@/lib/constants'; -import { transformBenchmarkRows } from '@/lib/benchmark-transform'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; +import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; /** Build deduplicated comparison dates, excluding the main run date. */ @@ -83,12 +83,22 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, + selectedPercentile = 'p90', /** When set, only series for these two registry GPU keys are shown (compare pages). */ compareGpuPair?: readonly [string, string] | null, + /** + * GitHub run id (g_runid) from the run picker. 
When set, the benchmarks API + * scopes results to that workflow run instead of returning the latest per + * config — disambiguates when two runs land on the same date. + */ + selectedRunId?: string, ) { // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the // materialized view instead of firing a redundant second fetch with identical data. + // When a specific run is selected, we always go through the runId branch and the + // date is effectively ignored — keep queryDate set so React Query still has a + // distinct cache key per date if the user navigates back to "latest". const queryDate = selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate ? '' @@ -98,7 +108,7 @@ export function useChartData( data: allRows, isLoading: queryLoading, error: queryError, - } = useBenchmarks(selectedModel, queryDate, enabled); + } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId); // GPU comparison: fetch data for each additional comparison date const comparisonDates = useMemo( @@ -125,11 +135,13 @@ export function useChartData( // Merge main rows with comparison date rows. // Stamp each row with the *requested* date (not the actual DB date) so that // GPUGraph's activeDates filter (keyed by user-selected date) matches the points. - const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]); + // + // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via + // benchmark_type), so one filter covers every scenario. 
const rows = useMemo(() => { - if (!allRows || !sequenceIslOsl) return []; - const seqFilter = (r: { isl: number; osl: number }) => - r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl; + if (!allRows) return []; + const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) => + rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); // For each (hw, framework, spec_method, disagg, precision) group, keep only @@ -156,14 +168,14 @@ export function useChartData( .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })), ); return [...mainRows, ...extraRows]; - }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]); + }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]); // Transform filtered rows into chart data const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => { if (rows.length === 0) return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig }; - return transformBenchmarkRows(rows); - }, [rows]); + return transformBenchmarkRows(rows, selectedPercentile); + }, [rows, selectedPercentile]); // Sort hardware config — stabilize reference when keys haven't changed. // Different sequences for the same model often have the same GPU configs, @@ -198,8 +210,11 @@ export function useChartData( (chartDefinitions as ChartDefinition[]).map((chartDef) => { const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; - // Determine dynamic x-axis - let xAxisField: keyof AggDataEntry = chartDef.x; + // Default x-axis = chart's natural latency metric, percentile-adjusted + // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic + // scenarios `withPercentile` is a no-op when percentile === 'median'. 
+ const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry; + let xAxisField: keyof AggDataEntry = naturalX; let xAxisLabel = chartDef.x_label; const metricTitle = @@ -209,14 +224,25 @@ export function useChartData( // Resolve the effective x-axis override per chart type const effectiveXMetric = chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric; + // The TTFT override is now any *_ttft metric (not just p90_ttft) — the + // x-axis-mode picker reconciles the percentile prefix based on sequence + // kind (fixed-seq → median, agentic → user-picked percentile). const isTtftOverride = - effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft'; - const ttftLabel = - effectiveXMetric === 'p99_ttft' - ? 'P99 Time To First Token (s)' - : 'Median Time To First Token (s)'; - - if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) { + typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft'); + const ttftPctl = isTtftOverride + ? (effectiveXMetric as string).replace(/_ttft$/u, '') + : 'p90'; + const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase(); + const ttftLabel = `${ttftPctlWord} Time To First Token (s)`; + + const isAgentic = selectedSequence === Sequence.AgenticTraces; + + if ( + effectiveXMetric && + chartDef.chartType === 'interactivity' && + isInputMetric && + !isAgentic + ) { xAxisField = effectiveXMetric as keyof AggDataEntry; const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) { @@ -225,6 +251,10 @@ export function useChartData( xAxisLabel = isTtftOverride ? 
ttftLabel : chartDef.x_label; } } else if (chartDef.chartType === 'interactivity' && isInputMetric) { + // Agentic falls through here too — the manual X-axis dropdown is + // hidden in agentic mode (would double up with the percentile + // selector), so the config default + percentile post-processing + // below drives the x axis. const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition; const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x; @@ -234,12 +264,35 @@ export function useChartData( xAxisLabel = ttftLabel; } + // Agentic: rewrite the resolved x metric to the chosen percentile, + // and relabel accordingly. Both have to be updated unconditionally — + // xAxisField may already be percentile-adjusted (via naturalX) while + // xAxisLabel still carries the raw chartDef.x_label prefix. + // The chart heading ("vs. ") is also rewritten to include + // the percentile so the title above the plot reflects what's drawn. + const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition; + let chartHeading = (chartDef[headingKey] as string) || chartDef.heading; + if (isAgentic) { + xAxisField = withPercentile( + xAxisField as string, + selectedPercentile, + ) as keyof AggDataEntry; + const pctlWord = selectedPercentile.toUpperCase(); + xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord); + chartHeading = chartHeading.replace( + /^(vs\.\s+)(?:(Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu, + `$1${pctlWord} `, + ); + } + // The x-axis is "flipped" only when the good-direction reverses // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). // E2EL → TTFT keeps the same direction ("lower is better" for both), // so no roofline flip is needed for the e2e chart. + // Compare against `naturalX` (percentile-adjusted) — switching the + // percentile of the same logical metric is NOT a flip. 
const xAxisFlipped = - xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride); + xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride); const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition; const dynamicYLabel = chartDef[yLabelKey]; @@ -260,6 +313,7 @@ export function useChartData( chartDefinition: { ...chartDef, ...rooflineOverrides, + heading: chartHeading, x_label: xAxisLabel, y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel), }, @@ -267,7 +321,13 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric], + [ + selectedYAxisMetric, + selectedXAxisMetric, + selectedE2eXAxisMetric, + selectedPercentile, + selectedSequence, + ], ); // Build renderable graphs (data processing + stable chart definitions) @@ -299,7 +359,8 @@ export function useChartData( // Filter to points that have the selected metric, then remap x/y const hasMetric = filteredData.some((d) => metricKey in d); - const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft'; + const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft'); + const isAgentic = selectedSequence === Sequence.AgenticTraces; const processedData = hasMetric ? filteredData .filter((d) => metricKey in d) @@ -314,11 +375,14 @@ export function useChartData( roof, }; }) - // When TTFT is on the x-axis, apply the latency limit to filter overload outliers - // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left) + // When TTFT is on the x-axis, apply the latency limit to filter + // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that + // compress all real data to the far left). Skip for agentic — long + // TTFTs there reflect real workloads (multi-turn, big prompts). 
.filter( (d) => !isTtftX || + isAgentic || !chartDefinition.y_latency_limit || d.x <= chartDefinition.y_latency_limit, ) diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json index e26d237e..dcd91e60 100644 --- a/packages/app/src/components/inference/inference-chart-config.json +++ b/packages/app/src/components/inference/inference-chart-config.json @@ -13,9 +13,9 @@ "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)", "y_inputTputPerGpu_title": "Input Token Throughput per GPU", "y_inputTputPerGpu_roofline": "upper_left", - "y_inputTputPerGpu_x": "p99_ttft", - "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)", - "y_inputTputPerGpu_heading": "vs. P99 Time To First Token", + "y_inputTputPerGpu_x": "p90_ttft", + "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)", + "y_inputTputPerGpu_heading": "vs. P90 Time To First Token", "y_outputTputPerGpu": "outputTputPerGpu.y", "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)", "y_outputTputPerGpu_title": "Output Token Throughput per GPU", @@ -105,8 +105,8 @@ "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)", "y_inputTputPerGpu_title": "Input Token Throughput per GPU", "y_inputTputPerGpu_roofline": "upper_right", - "y_inputTputPerGpu_x": "p99_ttft", - "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)", + "y_inputTputPerGpu_x": "p90_ttft", + "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)", "y_outputTputPerGpu": "outputTputPerGpu.y", "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)", "y_outputTputPerGpu_title": "Output Token Throughput per GPU", diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts index be076418..b0eb1446 100644 --- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts +++ 
b/packages/app/src/components/inference/replay/buildReplayTimeline.ts @@ -82,8 +82,7 @@ function resolveXAxisField( const metricTitle = (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || ''; const isInputMetric = metricTitle.toLowerCase().includes('input'); - const isTtftOverride = - selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + const isTtftOverride = selectedXAxisMetric === 'p90_ttft'; if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { return selectedXAxisMetric; diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index 5b5f9ec2..bedded40 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -36,6 +36,8 @@ import type { Model, Sequence } from '@/lib/data-mappings'; * @property {number} p99_e2el - 99th percentile of End-to-End Latency. */ export interface AggDataEntry { + /** Stable per-point id from benchmark_results — for trace_replay lookups. 
*/ + id?: number; hw: string; mtp?: string; hwKey: string; @@ -50,23 +52,43 @@ export interface AggDataEntry { mean_ttft: number; median_ttft: number; std_ttft: number; + p75_ttft: number; + p90_ttft: number; + p95_ttft: number; p99_ttft: number; + 'p99.9_ttft': number; mean_tpot: number; mean_intvty: number; median_tpot: number; median_intvty: number; std_tpot: number; std_intvty: number; + p75_tpot: number; + p75_intvty: number; + p90_tpot: number; + p90_intvty: number; + p95_tpot: number; + p95_intvty: number; p99_tpot: number; p99_intvty: number; + 'p99.9_tpot': number; + 'p99.9_intvty': number; mean_itl: number; median_itl: number; std_itl: number; + p75_itl: number; + p90_itl: number; + p95_itl: number; p99_itl: number; + 'p99.9_itl': number; mean_e2el: number; median_e2el: number; std_e2el: number; + p75_e2el: number; + p90_e2el: number; + p95_e2el: number; p99_e2el: number; + 'p99.9_e2el': number; disagg: boolean; num_prefill_gpu: number; num_decode_gpu: number; @@ -88,6 +110,29 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */ + benchmark_type?: string; + /** ISL in tokens — null for agentic_traces. */ + isl?: number | null; + /** OSL in tokens — null for agentic_traces. */ + osl?: number | null; + // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ── + /** "on" | "off" — whether KV cache offload to CPU was enabled. */ + offload_mode?: string; + /** Actual server-observed GPU prefix-cache hit rate (0..1). */ + server_gpu_cache_hit_rate?: number; + /** Actual server-observed CPU prefix-cache hit rate (0..1). */ + server_cpu_cache_hit_rate?: number; + /** Infinite-cache theoretical hit rate (0..1) computed from trace. */ + theoretical_cache_hit_rate?: number; + /** Total requests attempted during the window. 
*/ + num_requests_total?: number; + /** Requests that completed successfully. */ + num_requests_successful?: number; + /** Total prompt tokens served. */ + total_prompt_tokens?: number; + /** Total generated (output) tokens. */ + total_generation_tokens?: number; } /** @@ -490,10 +535,26 @@ export interface InferenceChartContextType { workflowInfo: any; selectedYAxisMetric: string; setSelectedYAxisMetric: (metric: string) => void; + /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */ + selectedPercentile: string; + setSelectedPercentile: (p: string) => void; selectedXAxisMetric: string | null; setSelectedXAxisMetric: (metric: string | null) => void; selectedE2eXAxisMetric: string | null; setSelectedE2eXAxisMetric: (metric: string | null) => void; + /** + * Which chart variant the user wants to see — the inference card shows one chart + * at a time, picked by the big buttons above the chart. + * - 'ttft' → e2e chartType with x-axis forced to p90_ttft + * - 'e2e' → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el) + * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty) + * - 'session-time' → agentic-only; x = mean-normalized session time (live-computed from trace blobs) + * - 'prefill-tps' → agentic-only; x = mean of P90 prefill TPS/user per session + */ + selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; + setSelectedXAxisMode: ( + mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps', + ) => void; scaleType: 'auto' | 'linear' | 'log'; setScaleType: (type: 'auto' | 'linear' | 'log') => void; setIsLegendExpanded: (metric: boolean) => void; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 0b1705b0..ad222edc 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ 
b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -1,13 +1,14 @@ 'use client'; -import { useState } from 'react'; +import { useEffect, useState } from 'react'; import { track } from '@/lib/analytics'; import { useInference } from '@/components/inference/InferenceContext'; import { ModelSelector, - SequenceSelector, + ScenarioSelector, + PercentileSelector, PrecisionSelector, } from '@/components/ui/chart-selectors'; import { DateRangePicker } from '@/components/ui/date-range-picker'; @@ -24,7 +25,7 @@ import { SearchableSelect } from '@/components/ui/searchable-select'; import { TooltipProvider } from '@/components/ui/tooltip'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { ChartDefinition } from '@/components/inference/types'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model, type Percentile } from '@/lib/data-mappings'; // Build Y-axis metric options from static chart config JSON — available immediately, no API wait const METRIC_GROUPS = [ @@ -79,6 +80,13 @@ interface ChartControlsProps { } export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) { + // The percentile selector is rendered conditionally on `selectedSequence`, + // which on the client is hydrated from URL params. SSR doesn't see the URL, + // so deferring the conditional until after mount keeps the initial DOM + // identical between server and client (avoids hydration warnings). + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const [openDropdown, setOpenDropdown] = useState(null); const handleDropdownOpenChange = (dropdownKey: string) => (open: boolean) => { if (open) { @@ -87,6 +95,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro } setOpenDropdown((current) => (current === dropdownKey ? 
null : current)); }; + const { selectedModel, setSelectedModel, @@ -96,6 +105,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro setSelectedPrecisions, selectedYAxisMetric, setSelectedYAxisMetric, + selectedPercentile, + setSelectedPercentile, graphs, selectedGPUs, setSelectedGPUs, @@ -214,14 +225,21 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro availableModels={availableModels} data-testid="model-selector" /> - + {mounted && selectedSequence === Sequence.AgenticTraces && ( + setSelectedPercentile(p)} + data-testid="percentile-selector" + /> + )} {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') && - isInputMetric && ( + isInputMetric && + selectedSequence !== Sequence.AgenticTraces && (
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index f0e1692a..fd6cd9c1 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -1,8 +1,8 @@ 'use client'; import { track } from '@/lib/analytics'; import dynamic from 'next/dynamic'; -import { useMemo, useRef, useState } from 'react'; -import { BarChart3, ChevronDown, Table2, X } from 'lucide-react'; +import { useEffect, useMemo, useRef, useState } from 'react'; +import { BarChart3, Table2, X } from 'lucide-react'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import { useInference } from '@/components/inference/InferenceContext'; @@ -30,7 +30,6 @@ import { DialogHeader, DialogTitle, } from '@/components/ui/dialog'; -import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; import { Skeleton } from '@/components/ui/skeleton'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { @@ -40,8 +39,10 @@ import { getModelLabel, getPrecisionLabel, getSequenceLabel, + sequenceKind, } from '@/lib/data-mappings'; import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs'; +import { useDerivedAgenticMetrics } from '@/hooks/api/use-derived-agentic-metrics'; import { useTrendData } from '@/components/inference/hooks/useTrendData'; import { hardwareKeyMatchesAnyBase } from '@/lib/constants'; @@ -59,54 +60,30 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra }); import WorkflowInfoDisplay from './WorkflowInfoDisplay'; -/** Controlled popover dropdown for the e2e chart x-axis toggle. 
*/ -function E2eXAxisDropdown({ - xAxisLabel, - xAxisOptions, - selectedValue, - onSelect, -}: { - xAxisLabel: string; - xAxisOptions: { value: string | null; label: string }[]; - selectedValue: string | null; - onSelect: (value: string | null) => void; -}) { - const [open, setOpen] = useState(false); - return ( - - - - - - {xAxisOptions.map((opt) => ( - - ))} - - - ); -} - type InferenceViewMode = 'chart' | 'table'; +/** + * The chart variants the user can choose with the big buttons above the chart + * card. The first three map to entries in `inference-chart-config.json` plus a + * forced x-axis override for the E2E chartType; the last two are agentic-only + * derived metrics computed live from the stored trace_replay blobs. + */ +type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps'; + +interface XAxisModeButton { + value: XAxisMode; + label: string; + /** When true, the button is only shown on agentic scenarios. */ + agenticOnly?: boolean; +} +const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [ + { value: 'ttft', label: 'TTFT' }, + { value: 'e2e', label: 'E2E Latency' }, + { value: 'interactivity', label: 'Interactivity' }, + { value: 'session-time', label: 'Session Time', agenticOnly: true }, + { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true }, +]; + const VIEW_MODE_OPTIONS: SegmentedToggleOption[] = [ { value: 'chart', @@ -151,8 +128,10 @@ export default function ChartDisplay() { logScale, activeHwTypes, activeDates, - setSelectedE2eXAxisMetric, + selectedPercentile, compareGpuPair, + selectedXAxisMode, + setSelectedXAxisMode, } = useInference(); const { @@ -161,6 +140,13 @@ export default function ChartDisplay() { totalDatesQueried, } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates); + // SSR has no URL access and `selectedSequence` defaults to agentic on the + // server even when the URL says fixed-seq — so any conditional rendering + // that keys off 
`sequenceKind(selectedSequence)` would diverge between + // server and client first render. Defer agentic-only UI until after mount. + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const [viewModes, setViewModes] = useState>({}); const replayHandlesRef = useRef>({}); const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart'; @@ -210,6 +196,7 @@ export default function ChartDisplay() { chartType, selectedYAxisMetric, effectiveXMetric, + { isAgentic: sequenceKind(selectedSequence) === 'agentic' }, ); let overlayPoints = processed; @@ -327,214 +314,258 @@ export default function ChartDisplay() { })); }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]); - const displayGraphs = isFirstLoad - ? Array.from({ length: 2 }).map((_, index) => ( - - - - - - )) - : effectiveGraphs.length === 0 - ? [] - : effectiveGraphs.map((graph, graphIndex) => { - const isTimelineMode = Boolean( - selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, - ); - const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; - return ( -
-
- handleViewModeChange(graphIndex, v)} - ariaLabel="View mode" - testId={`inference-view-toggle-${graphIndex}`} - /> - } - hideImageExport={getViewMode(graphIndex) === 'table'} - setIsLegendExpanded={setIsLegendExpanded} - exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} - onExportMp4={ - replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined - } - onExportCsv={() => { - const visibleData = graph.data.filter((d) => + // Show one chart at a time, picked by the buttons above the chart. + // - 'interactivity' renders the interactivity chartType. + // - 'ttft' / 'e2e' render the e2e chartType (x swap via selectedE2eXAxisMetric). + // - 'session-time' / 'prefill-tps' render the e2e chartType too; the x-axis + // is overridden below from live-computed derived metrics. + const visibleGraphs = useMemo(() => { + const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e'; + const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType); + return filtered.length > 0 ? filtered : effectiveGraphs; + }, [effectiveGraphs, selectedXAxisMode]); + + // Derived-metric path: fetch live-computed values from the trace_replay blobs + // and override scatter data.x. Only fires for the two agentic-only modes. 
+ const useDerived = + sequenceKind(selectedSequence) === 'agentic' && + (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps'); + const derivedTargetIds = useMemo(() => { + if (!useDerived) return [] as number[]; + const ids = new Set(); + for (const g of visibleGraphs) { + for (const d of g.data) { + if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') { + ids.add(d.id); + } + } + } + return [...ids]; + }, [useDerived, visibleGraphs]); + const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived); + const derivedMetrics = derivedQuery.data; + // Show skeleton (not "No data available") while the derived-metrics query + // is in flight. Without this gate, every flip to session-time / prefill-tps + // briefly blanks the chart and surfaces a misleading empty-state. + const isDerivedLoading = + useDerived && + derivedTargetIds.length > 0 && + (derivedQuery.isPending || derivedQuery.isFetching) && + !derivedMetrics; + + const renderableGraphs = useMemo(() => { + if (!useDerived) return visibleGraphs; + if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] })); + const isSession = selectedXAxisMode === 'session-time'; + const xLabel = isSession + ? 'Mean Normalized Session Time (min)' + : 'P90 Prefill TPS per user (tok/s)'; + // Roofline corner = which corner the curve sweeps from / toward, matching + // existing chart-config convention: + // - session-time: as concurrency rises, session time AND throughput both + // grow → curve goes bottom-left → top-right → upper_right. + // - prefill-tps: as concurrency rises, per-user prefill TPS falls while + // total throughput rises → curve goes top-left → bottom-right → + // upper_left. + const rooflineCorner = isSession ? 
'upper_right' : 'upper_left'; + return visibleGraphs.map((g) => { + const overriddenChartDef = { + ...g.chartDefinition, + x_label: xLabel, + // y_latency_limit was meant to suppress fixed-seq overload outliers on + // the TTFT axis — irrelevant for these derived axes. + y_latency_limit: undefined, + [`${selectedYAxisMetric}_roofline` as keyof typeof g.chartDefinition]: rooflineCorner, + }; + const data = g.data + .map((d) => { + if (typeof d.id !== 'number') return null; + const m = derivedMetrics[d.id]; + const raw = isSession ? m?.normalized_session_time_s : m?.p90_prefill_tps_per_user; + if (raw === null || raw === undefined || !Number.isFinite(raw)) return null; + const v = isSession ? raw / 60 : raw; + return { ...d, x: v }; + }) + .filter((d): d is NonNullable => d !== null); + return { ...g, chartDefinition: overriddenChartDef, data }; + }); + }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]); + + const displayGraphs = + isFirstLoad || isDerivedLoading + ? [ + + + + + , + ] + : renderableGraphs.length === 0 + ? [] + : renderableGraphs.map((graph, graphIndex) => { + const isTimelineMode = Boolean( + selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, + ); + const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; + return ( +
+
+ - - {(() => { - const chartCaption = ( - <> -

- { - graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] - }{' '} - {(() => { - // For Input metrics with dynamic x-axis, use dynamic heading - const metricTitle = - (graph.chartDefinition[ + ? 'gpu_timeseries' + : graph.chartDefinition.chartType === 'e2e' + ? 'latency' + : 'interactivity' + } + leadingControls={ + handleViewModeChange(graphIndex, v)} + ariaLabel="View mode" + testId={`inference-view-toggle-${graphIndex}`} + /> + } + hideImageExport={getViewMode(graphIndex) === 'table'} + setIsLegendExpanded={setIsLegendExpanded} + exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} + onExportMp4={ + replayAvailable + ? () => replayHandlesRef.current[graphIndex]?.open() + : undefined + } + onExportCsv={() => { + const visibleData = graph.data.filter((d) => + isTimelineMode + ? activeDates.has(`${d.date}_${d.hwKey}`) + : activeHwTypes.has(d.hwKey as string) && + selectedPrecisions.includes(d.precision), + ); + const { headers, rows } = inferenceChartToCsv( + visibleData, + graph.model, + graph.sequence, + ); + exportToCsv( + `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, + headers, + rows, + ); + }} + /> + + {(() => { + const chartCaption = ( + <> +

+ { + graph.chartDefinition[ `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] as string) || ''; - const isInputMetric = metricTitle.toLowerCase().includes('input'); - if ( - graph.chartDefinition.chartType === 'interactivity' && - isInputMetric && - selectedXAxisMetric - ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; + ] + }{' '} + {(() => { + // For Input metrics with dynamic x-axis, use dynamic heading + const metricTitle = + (graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] as string) || ''; + const isInputMetric = metricTitle.toLowerCase().includes('input'); + if ( + graph.chartDefinition.chartType === 'interactivity' && + isInputMetric && + selectedXAxisMetric === 'p90_ttft' + ) { + return 'vs. P90 Time To First Token'; } - } - // For e2e chart: render clickable inline dropdown for x-axis - if (graph.chartDefinition.chartType === 'e2e') { - const xAxisLabel = - selectedE2eXAxisMetric === 'p99_ttft' - ? 'P99 TTFT' - : selectedE2eXAxisMetric === 'median_ttft' - ? 'Median TTFT' - : 'End-to-end Latency'; - const xAxisOptions = [ - { value: null, label: 'End-to-end Latency' }, - { value: 'p99_ttft', label: 'P99 TTFT' }, - { value: 'median_ttft', label: 'Median TTFT' }, - ]; - const zoomPrefix = - selectedDateRange.startDate && - selectedDateRange.endDate && - selectedGPUs.length > 0 - ? 'gpu_timeseries' - : 'latency'; - return ( - { - setSelectedE2eXAxisMetric(value); - track('latency_x_axis_metric_selected', { - metric: value ?? 'median_e2el', - }); - window.dispatchEvent( - new CustomEvent( - `${zoomPrefix}_zoom_reset_chart-${graphIndex}`, - ), - ); - }} - /> - ); - } + // For e2e chart: heading is driven by the buttons above the + // card. Derived-metric modes win first; otherwise the metric + // carries the percentile prefix (e.g. p90_ttft, median_ttft). 
+ if (graph.chartDefinition.chartType === 'e2e') { + if (selectedXAxisMode === 'session-time') { + return 'vs. Mean Normalized Session Time'; + } + if (selectedXAxisMode === 'prefill-tps') { + return 'vs. P90 Prefill TPS / user'; + } + const isAgentic = sequenceKind(selectedSequence) === 'agentic'; + if (selectedE2eXAxisMetric?.endsWith('_ttft')) { + const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); + const word = pctl === 'median' ? 'Median' : pctl.toUpperCase(); + return `vs. ${word} Time To First Token`; + } + const pctlWord = selectedPercentile.toUpperCase(); + return isAgentic + ? `vs. ${pctlWord} End-to-end Latency` + : 'vs. End-to-end Latency'; + } - // Fall back to configured heading - return ( - graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition - ] || graph.chartDefinition.heading - ); - })()} -

-

- {getModelLabel(graph.model as Model)} •{' '} - {selectedPrecisions - .map((prec) => getPrecisionLabel(prec as Precision)) - .join(', ')}{' '} - • {getSequenceLabel(graph.sequence as Sequence)} •{' '} - {isUnofficialRun - ? 'Source: UNOFFICIAL' - : 'Source: SemiAnalysis InferenceX™'} - {selectedRunDate && ( - <> - {' '} - • Updated:{' '} - {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( - 'en-US', - { - year: 'numeric', - month: '2-digit', - day: '2-digit', - timeZone: 'UTC', - }, - )} - - )} -

- - - - ); - - if (getViewMode(graphIndex) === 'table') { - const overlay = - graph.chartDefinition.chartType === 'e2e' - ? overlayDataByChartType.e2e - : overlayDataByChartType.interactivity; - const overlayRows = (overlay?.data ?? []).filter((p) => - selectedPrecisions.includes(p.precision), - ); - return ( - <> - {chartCaption} - 0 ? [...graph.data, ...overlayRows] : graph.data - } - chartDefinition={graph.chartDefinition} - selectedYAxisMetric={selectedYAxisMetric} - /> + // Fall back to the heading baked into chartDefinition + // by useChartData (already resolves per-metric overrides + // and applies the agentic percentile rewrite). + return graph.chartDefinition.heading; + })()} +

+

+ {getModelLabel(graph.model as Model)} •{' '} + {selectedPrecisions + .map((prec) => getPrecisionLabel(prec as Precision)) + .join(', ')}{' '} + • {getSequenceLabel(graph.sequence as Sequence)} •{' '} + {isUnofficialRun + ? 'Source: UNOFFICIAL' + : 'Source: SemiAnalysis InferenceX™'} + {selectedRunDate && ( + <> + {' '} + • Updated:{' '} + {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( + 'en-US', + { + year: 'numeric', + month: '2-digit', + day: '2-digit', + timeZone: 'UTC', + }, + )} + + )} +

+ + ); - } - return selectedDateRange.startDate && - selectedDateRange.endDate && - selectedGPUs.length > 0 ? ( - - ) : ( -
- + selectedPrecisions.includes(p.precision), + ); + return ( + <> + {chartCaption} + 0 + ? [...graph.data, ...overlayRows] + : graph.data + } + chartDefinition={graph.chartDefinition} + selectedYAxisMetric={selectedYAxisMetric} + /> + + ); + } + + return selectedDateRange.startDate && + selectedDateRange.endDate && + selectedGPUs.length > 0 ? ( + - {selectedGPUs.length > 0 && - (!selectedDateRange.startDate || !selectedDateRange.endDate) && ( -
-

- Select a date range to view GPU comparison -

-
- )} -
- ); - })()} - {replayAvailable && ( - { - replayHandlesRef.current[graphIndex] = handle; - }} - parentChartId={`chart-${graphIndex}`} - chartDefinition={graph.chartDefinition} - yLabel={`${ - graph.chartDefinition[ - `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition - ] - }`} - xLabel={graph.chartDefinition.x_label} - /> - )} -
-
-
- ); - }); + ) : ( +
+ + {selectedGPUs.length > 0 && + (!selectedDateRange.startDate || !selectedDateRange.endDate) && ( +
+

+ Select a date range to view GPU comparison +

+
+ )} +
+ ); + })()} + {replayAvailable && ( + { + replayHandlesRef.current[graphIndex] = handle; + }} + parentChartId={`chart-${graphIndex}`} + chartDefinition={graph.chartDefinition} + yLabel={`${ + graph.chartDefinition[ + `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition + ] + }`} + xLabel={graph.chartDefinition.x_label} + /> + )} + +
+
+ ); + }); return (
@@ -640,6 +686,43 @@ export default function ChartDisplay() { )} +
+ {X_AXIS_MODE_BUTTONS.filter(({ agenticOnly }) => { + if (!agenticOnly) return true; + // Before client mount, conditionalize on the server-default kind + // (agentic) so SSR + first client render produce identical DOM. After + // mount, hide the agentic-only buttons on fixed-seq sequences. + if (!mounted) return true; + return sequenceKind(selectedSequence) === 'agentic'; + }).map(({ value, label }) => { + const isActive = selectedXAxisMode === value; + return ( + + ); + })} +
{displayGraphs}
{/* Performance Over Time — Modal Drill-Down */} diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index f9a73aa8..fdcf8952 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -6,6 +6,8 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; +import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; +import { useRouter } from 'next/navigation'; import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { computeToggle } from '@/hooks/useTogglableSet'; @@ -63,6 +65,96 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; +// Greedy label-collision avoidance. +// Each candidate is the y-position of the FIRST baseline (relative to point +// center) which we apply via the first tspan's `dy` — later tspans cascade +// down by 1.1em. We try above/below at primary and secondary offsets, and +// hide the label if all four positions collide. +function avoidLabelCollisions( + zoomGroup: d3.Selection, +): void { + interface LabelInfo { + el: SVGTextElement; + firstTspan: SVGTSpanElement; + cx: number; + cy: number; + w: number; + nLines: number; + defaultFirstY: number; + } + const labels: LabelInfo[] = []; + const ASCENT = 9; + const DESCENT = 3; + const LINE_H = 11; + + zoomGroup.selectAll('.dot-group').each(function () { + const labelEl = this.querySelector('.point-label'); + if (!labelEl) return; + if ((this as SVGGElement).style.opacity === '0') return; + const tspans = labelEl.querySelectorAll('tspan'); + if (tspans.length === 0) return; + const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; + const m = transform.match(/translate\(([^,]+),([^)]+)\)/); + if (!m) return; + const cx = parseFloat(m[1]); + const cy = parseFloat(m[2]); + const nLines = tspans.length; + const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point + // Reset to default before measuring so prior positioning doesn't bias bbox + tspans[0].setAttribute('dy', `${defaultFirstY}px`); + labelEl.style.opacity = '1'; + const bbox = labelEl.getBBox(); + labels.push({ + el: labelEl, + firstTspan: tspans[0], + cx, + cy, + w: bbox.width, + nLines, + defaultFirstY, + }); + }); + + labels.sort((a, b) => a.cx - b.cx); + const placed: { left: number; right: number; top: number; bottom: number }[] = []; + const pad = 2; + + for (const lab of labels) { + const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT; + const aboveFirstY = lab.defaultFirstY; + const belowFirstY = 14; // first baseline 14px below point center + const candidates = [ + aboveFirstY, + belowFirstY, + aboveFirstY - blockH - 2, + belowFirstY + blockH + 2, + ]; + let chosenY: number | null = null; + let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; + for (const firstY of candidates) { + const top = lab.cy + firstY - ASCENT - pad; + const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad; + const left = lab.cx - lab.w / 2 - pad; + const right = lab.cx + lab.w / 2 + pad; + const collides = placed.some( + (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), + ); + if (!collides) { + chosenY = firstY; + chosenBox = { left, right, top, bottom }; + break; + } + } + if (chosenY !== null && chosenBox) { + lab.firstTspan.setAttribute('dy', `${chosenY}px`); + lab.el.style.opacity = '1'; + placed.push(chosenBox); + } else { + lab.el.style.opacity = '0'; + } + } +} + // X-shape path for overlay (unofficial) data points const X_SIZE = 5; const X_HOVER_SIZE = 7; @@ -258,6 +350,10 @@ const ScatterGraph = React.memo( ); const 
rooflines = useMemo(() => { + // Frontier scope is (hw, precision, date) — points from different dates + // can never share a frontier (a May 15 point can't dominate a May 17 plot). + // The legend grouping is still by (hw, precision); we just split the + // pareto compute per date and re-merge into the legend bucket. const result: Record = {}; const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition; const dir = chartDefinition[rooflineKey] as @@ -266,17 +362,31 @@ const ScatterGraph = React.memo( | 'lower_left' | 'lower_right' | undefined; - for (const hw of Object.keys(groupedData)) { - const front = - dir === 'upper_right' - ? paretoFrontUpperRight(groupedData[hw]) - : dir === 'upper_left' - ? paretoFrontUpperLeft(groupedData[hw]) - : dir === 'lower_left' - ? paretoFrontLowerLeft(groupedData[hw]) - : paretoFrontLowerRight(groupedData[hw]); - front.sort((a, b) => a.x - b.x); - result[hw] = front; + const frontierFn = + dir === 'upper_right' + ? paretoFrontUpperRight + : dir === 'upper_left' + ? paretoFrontUpperLeft + : dir === 'lower_left' + ? 
paretoFrontLowerLeft + : paretoFrontLowerRight; + for (const hwKey of Object.keys(groupedData)) { + const byDate = new Map(); + for (const p of groupedData[hwKey]) { + const d = p.date; + let bucket = byDate.get(d); + if (!bucket) { + bucket = []; + byDate.set(d, bucket); + } + bucket.push(p); + } + const combined: InferenceData[] = []; + for (const datePoints of byDate.values()) { + combined.push(...frontierFn(datePoints)); + } + combined.sort((a, b) => a.x - b.x); + result[hwKey] = combined; } return result; }, [groupedData, selectedYAxisMetric, chartDefinition]); @@ -284,7 +394,7 @@ const ScatterGraph = React.memo( const optimalPointKeys = useMemo(() => { const keys = new Set(); Object.values(rooflines).forEach((pts) => - pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)), + pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}_${p.date}-${p.x}-${p.y}`)), ); return keys; }, [rooflines]); @@ -311,6 +421,10 @@ const ScatterGraph = React.memo( const buildPointConfigId = useCallback((point: InferenceData): string => { let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`; if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`; + // Agentic runs emit two rows per (config, conc) — one offload=on, one off. + // Without this suffix, d3's data join treats them as the same point and + // drops one variant (along with its halo). + if (point.offload_mode) key += `|offload-${point.offload_mode}`; return key; }, []); @@ -383,6 +497,18 @@ const ScatterGraph = React.memo( // All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); + // Trace-replay histograms (ISL / OSL distributions) for agentic points. + // Pre-fetch the whole visible set so tooltip render stays synchronous. 
+ const agenticIds = useMemo(() => { + const ids: number[] = []; + for (const p of pointsData) { + if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id); + } + return ids; + }, [pointsData]); + const { data: traceHistograms } = useTraceHistograms(agenticIds); + const router = useRouter(); + // Gradient label data const allPointLabelsByKey = useMemo(() => { const globalLabelColorMap = new Map(); @@ -422,7 +548,9 @@ const ScatterGraph = React.memo( const visiblePoints = useMemo(() => { let pts = filteredData; if (hideNonOptimal) { - pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)); + pts = pts.filter((d) => + optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`), + ); } return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts; }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]); @@ -507,7 +635,8 @@ const ScatterGraph = React.memo( (d: InferenceData) => effectiveActiveHwTypes.has(d.hwKey as string) && selectedPrecisions.includes(d.precision) && - (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)), + (!hideNonOptimal || + optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`)), [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys], ); @@ -625,6 +754,7 @@ const ScatterGraph = React.memo( d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any, ); } + avoidLabelCollisions(ctx.layout.zoomGroup); }, }), [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type], @@ -644,6 +774,8 @@ const ScatterGraph = React.memo( hardwareConfig, isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)), runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, + traceHistogram: + typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? 
undefined) : undefined, }), getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x), getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y), @@ -659,26 +791,43 @@ const ScatterGraph = React.memo( ), onPointClick: (d: InferenceData) => { track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y }); - // Attach track-over-time button handler in the tooltip const tooltipEl = chartRef.current?.getTooltipElement(); - if (tooltipEl) { - const btn = tooltipEl.querySelector('[data-action="track-over-time"]'); - if (btn) { - btn.addEventListener('click', (btnEvent) => { - btnEvent.stopPropagation(); - const configId = buildPointConfigId(d); - if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId); - else addTrackedConfig(d, chartDefinition.chartType); - chartRef.current?.dismissTooltip(); - chartRef.current?.hideTooltip(); - track('latency_point_tracked_via_tooltip', { - hwKey: String(d.hwKey), - tp: d.tp, - conc: d.conc, - precision: d.precision, - }); + if (!tooltipEl) return; + + // ── Summary-page actions ────────────────────────────────────────── + const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]'); + if (trackBtn) { + trackBtn.addEventListener('click', (btnEvent) => { + btnEvent.stopPropagation(); + const configId = buildPointConfigId(d); + if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId); + else addTrackedConfig(d, chartDefinition.chartType); + chartRef.current?.dismissTooltip(); + chartRef.current?.hideTooltip(); + track('latency_point_tracked_via_tooltip', { + hwKey: String(d.hwKey), + tp: d.tp, + conc: d.conc, + precision: d.precision, }); - } + }); + } + + // ── "View charts" → navigate to dedicated detail page ──────────── + const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]'); + if (viewBtn && typeof d.id === 'number') { + const pointId = d.id; + viewBtn.addEventListener('click', (btnEvent) => { + 
btnEvent.stopPropagation(); + track('latency_view_charts_opened', { + id: pointId, + hwKey: String(d.hwKey), + conc: d.conc, + }); + chartRef.current?.dismissTooltip(); + chartRef.current?.hideTooltip(); + router.push(`/inference/agentic/${pointId}`); + }); } }, attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0) @@ -693,6 +842,11 @@ const ScatterGraph = React.memo( removeTrackedConfig, chartDefinition.chartType, selectedPrecisions, + // Tooltip content closure reads traceHistograms to decide whether to + // show the "View charts" button — rebuild config when the histogram + // fetch resolves so the button appears for points that have data. + traceHistograms, + router, ], ); @@ -743,35 +897,64 @@ const ScatterGraph = React.memo( const precision = key.split('_').pop()!; const visible = effectiveActiveHwTypes.has(hw) && selectedPrecisions.includes(precision); - let stroke = getCssColor(resolveColor(hw)); - - if (showGradientLabels) { - const pointLabels = allPointLabelsByKey[key]; - if (pointLabels) { - const stops = computeGradientStops(pointLabels, xScale); - if (stops) { - const gid = `roofline-gradient-${chartId}-${key}`; - activeGradientIds.add(gid); - let gradient = defs.select(`#${CSS.escape(gid)}`); - if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid); - gradient - .attr('gradientUnits', 'userSpaceOnUse') - .attr('x1', xScale(pts[0].x)) - .attr('y1', 0) - .attr('x2', xScale(pts.at(-1)!.x)) - .attr('y2', 0); - gradient - .selectAll('stop') - .data(stops) - .join('stop') - .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`) - .attr('stop-color', (s) => s.color); - stroke = `url(#${gid})`; - } + const baseStroke = getCssColor(resolveColor(hw)); + + // Split into per-date sub-paths so the line never crosses dates. + // (When only one date is present the loop runs once with the full set.) 
+ const byDate = new Map(); + for (const p of pts) { + let bucket = byDate.get(p.date); + if (!bucket) { + bucket = []; + byDate.set(p.date, bucket); } + bucket.push(p); } + const singleDate = byDate.size === 1; + + for (const [date, datePoints] of byDate) { + if (datePoints.length <= 1) continue; + const entryKey = singleDate ? key : `${key}__${date}`; + let stroke = baseStroke; + + // Gradient labels only apply in the single-date case; mapping the + // (key-wide) ParetoPointLabel array onto per-date sub-segments is + // ambiguous and the comparison-date overlay is a rare combo. + if (singleDate && showGradientLabels) { + const pointLabels = allPointLabelsByKey[key]; + if (pointLabels) { + const stops = computeGradientStops(pointLabels, xScale); + if (stops) { + const gid = `roofline-gradient-${chartId}-${entryKey}`; + activeGradientIds.add(gid); + let gradient = defs.select(`#${CSS.escape(gid)}`); + if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid); + gradient + .attr('gradientUnits', 'userSpaceOnUse') + .attr('x1', xScale(datePoints[0].x)) + .attr('y1', 0) + .attr('x2', xScale(datePoints.at(-1)!.x)) + .attr('y2', 0); + gradient + .selectAll('stop') + .data(stops) + .join('stop') + .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`) + .attr('stop-color', (s) => s.color); + stroke = `url(#${gid})`; + } + } + } - entries.push({ key, hw, precision, points: pts, stroke, visible }); + entries.push({ + key: entryKey, + hw, + precision, + points: datePoints, + stroke, + visible, + }); + } }); // Remove stale gradients @@ -1176,11 +1359,26 @@ const ScatterGraph = React.memo( .y((d) => newYScale(d.y)) .curve(d3.curveMonotoneX); - // Update roofline paths + // Update roofline paths — must split per-date so the zoom redraw + // matches the per-date sub-paths created in the initial render. 
Object.entries(rooflines).forEach(([key, pts]) => { if (pts.length < 2) return; - const sel = zoomGroup.select(`.roofline-${key}`); - if (!sel.empty()) sel.attr('d', lineGen(pts) as string); + const byDate = new Map(); + for (const p of pts) { + let bucket = byDate.get(p.date); + if (!bucket) { + bucket = []; + byDate.set(p.date, bucket); + } + bucket.push(p); + } + const singleDate = byDate.size === 1; + for (const [date, datePoints] of byDate) { + if (datePoints.length < 2) continue; + const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`; + const sel = zoomGroup.select(`.${CSS.escape(cls)}`); + if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string); + } }); // Update gradient coordinates @@ -1406,7 +1604,8 @@ const ScatterGraph = React.memo( getOpacity: (d) => (isPointVisible(d) ? 1 : 0), getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + getLabelText: (d) => + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1506,17 +1705,31 @@ const ScatterGraph = React.memo( // Labels const showLabels = !hidePointLabels && !showGradientLabels; overlayPoints.each(function (d) { - d3.select(this) + const lines = showLabels + ? (useAdvancedLabels + ? `${getPointLabel(d)}\nC=${d.conc}` + : `${d.tp}\nC=${d.conc}` + ).split('\n') + : []; + const text = d3 + .select(this) .selectAll('.overlay-label') .data(showLabels ? [true] : []) .join('text') .attr('class', 'overlay-label') - .attr('dy', -10) .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') - .attr('pointer-events', 'none') - .text(useAdvancedLabels ? 
getPointLabel(d) : String(d.tp)); + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + const firstDy = -(1 + (lines.length - 1) * 1.1); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) + .text((l) => l); }); // Overlay tooltip handlers @@ -1784,6 +1997,23 @@ const ScatterGraph = React.memo( .attr('pointer-events', 'none'); }); + // Offload halo: dashed ring on every point that used KV offload (Pareto or not) + zoomGroup.selectAll('.dot-group').each(function (d) { + const showHalo = d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); + // Double-click to track/untrack zoomGroup .selectAll('.dot-group') @@ -1817,6 +2047,8 @@ const ScatterGraph = React.memo( }); }); + avoidLabelCollisions(zoomGroup); + // Log tick formatting on initial render if (xScaleConfig._isLog) { const xScale = ctx.xScale as d3.ScaleLogarithmic; @@ -1839,6 +2071,9 @@ const ScatterGraph = React.memo( chartDefinition.chartType, xScaleConfig._isLog, yScaleConfig.type, + optimalPointKeys, + getCssColor, + resolveColor, ], ); diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx index f9b1b3c8..73018483 100644 --- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx @@ -194,9 +194,7 @@ export function UnofficialChartDisplay() { `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition ] }{' '} - {graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof 
graph.chartDefinition - ] || graph.chartDefinition.heading} + {graph.chartDefinition.heading}

{graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence} diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 8f8705e1..589ba580 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -157,12 +157,12 @@ describe('processOverlayChartData', () => { }); it('remaps x to config override for input metrics on interactivity chart', () => { - // inputTputPerGpu has x override to p99_ttft on interactivity chart + // inputTputPerGpu has x override to p90_ttft on interactivity chart const data = [ pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_intvty: 50, } as any), ]; @@ -176,16 +176,11 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - median_ttft: 0.1, + p90_ttft: 0.1, median_intvty: 50, } as any), ]; - const result = processOverlayChartData( - data, - 'interactivity', - 'y_inputTputPerGpu', - 'median_ttft', - ); + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.1); }); @@ -195,76 +190,62 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_e2el: 2.5, } as any), ]; const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null); expect(result).toHaveLength(1); - // e2e uses median_e2el as x (from chart config default), not p99_ttft + // e2e uses median_e2el as x (from chart config default), not p90_ttft expect(result[0].x).toBe(2.5); }); - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => { - const data = [ - pt({ - x: 100, - tpPerGpu: { y: 42, roof: false }, - p99_ttft: 0.35, - median_e2el: 2.5, - } as any), - ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 
'p99_ttft'); - expect(result).toHaveLength(1); - expect(result[0].x).toBe(0.35); - }); - - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => { + it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => { const data = [ pt({ x: 100, tpPerGpu: { y: 42, roof: false }, - median_ttft: 0.12, + p90_ttft: 0.12, median_e2el: 2.5, } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.12); }); it('filters e2e TTFT outliers exceeding y_latency_limit', () => { const data = [ - pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any), - pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any), + pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any), + pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); it('does not filter interactivity points by latency limit when x-axis is default', () => { - // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity + // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity // chart's x-axis stays median_intvty for non-input metrics. The latency limit // (60) must NOT apply to median_intvty values. 
const data = [ pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any), pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(2); }); it('applies latency limit on interactivity only when x-axis is actually overridden', () => { - // When an input metric IS selected and x-axis overrides to p99_ttft, + // When an input metric IS selected and x-axis overrides to p90_ttft, // the latency limit should apply. const data = [ - pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any), - pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any), + pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any), + pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft'); - // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999 + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); + // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999 expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4b5335b6..4876c614 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -75,11 +75,13 @@ export function processOverlayChartData( chartType: 'e2e' | 'interactivity', selectedYAxisMetric: string, selectedXAxisMetric: string | null, + options?: { isAgentic?: boolean }, ): InferenceData[] { const chartDef = (chartDefinitions as 
ChartDefinition[]).find((d) => d.chartType === chartType); if (!chartDef) return []; const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; + const isAgentic = options?.isAgentic === true; // Resolve x-axis field (must match useChartData logic) const metricTitle = @@ -87,9 +89,11 @@ export function processOverlayChartData( const isInputMetric = metricTitle.toLowerCase().includes('input'); let xAxisField: string = chartDef.x; // selectedXAxisMetric is already the effective metric for this chart type - // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric) + // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric). + // Match any *_ttft metric — the x-axis-mode picker can now select any + // percentile (median/p75/p90/p99) depending on sequence kind. const isTtftOverride = - selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft'); if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { xAxisField = selectedXAxisMetric; @@ -109,7 +113,12 @@ export function processOverlayChartData( }) .filter( (d) => - xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit, + // Skip the latency limit for the natural x-axis or for agentic + // (long TTFTs are normal there, not overload outliers). 
+ xAxisField === chartDef.x || + isAgentic || + !chartDef.y_latency_limit || + d.x <= chartDef.y_latency_limit, ); return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 4c56d217..ccc371f9 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -19,6 +19,13 @@ export interface TooltipConfig { isTracked?: boolean; /** URL to the GitHub Actions workflow run */ runUrl?: string; + /** + * Per-request ISL/OSL arrays for agentic points, sourced from the stored + * aiperf `profile_export.jsonl`. Used to detect whether the point has any + * trace data (so the "View charts" button can appear); the actual + * distributions are rendered on the detail page, not inline. + */ + traceHistogram?: { isl: number[]; osl: number[] } | undefined; } export interface OverlayTooltipConfig extends TooltipConfig { @@ -88,6 +95,74 @@ const runLinkHTML = (runUrl?: string) => const tooltipLine = (label: string, value: string | number) => `

${label}: ${value}
`; +const formatPct = (v: number | undefined): string | null => + v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; + +/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */ +const fmt = (v: number): string => { + if (!Number.isFinite(v)) return String(v); + const rounded = parseFloat(v.toFixed(3)); + if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded); + return String(rounded); +}; + +/** + * Agentic-only tooltip rows: offload mode, KV cache hit rates, request + * success, token totals. Returns an empty string for non-agentic rows. + */ +const generateAgenticHTML = (d: InferenceData): string => { + if (d.benchmark_type !== 'agentic_traces') return ''; + + const parts: string[] = []; + if (d.offload_mode) { + parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase())); + } + + const gpuHit = formatPct(d.server_gpu_cache_hit_rate); + const cpuHit = formatPct(d.server_cpu_cache_hit_rate); + const theoHit = formatPct(d.theoretical_cache_hit_rate); + if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit)); + if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit)); + if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit)); + + if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) { + const successPct = + d.num_requests_total > 0 + ? 
` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)` + : ''; + parts.push( + tooltipLine( + 'Requests', + `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`, + ), + ); + } + + if (d.total_prompt_tokens !== undefined) { + parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens))); + } + if (d.total_generation_tokens !== undefined) { + parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens))); + } + + // Histograms + time-series live on the dedicated detail page now; the + // "View charts" button (rendered by the wrapper when pinned + has trace + // data) takes the user there. + + return parts.join(''); +}; + +/** "View charts" button — only visible when the tooltip is pinned and the + * point has stored trace data. Wired up by the ScatterGraph click handler. */ +const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => { + if (!isPinned || !hasTraceData) return ''; + return ``; +}; + const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…'); const imageTooltipLine = (image: string) => @@ -138,7 +213,16 @@ const generateParallelismHTML = (d: InferenceData): string => { * @returns HTML string for the tooltip content */ export const generateTooltipContent = (config: TooltipConfig): string => { - const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config; + const { + data: d, + isPinned, + xLabel, + yLabel, + selectedYAxisMetric, + hardwareConfig, + runUrl, + traceHistogram, + } = config; return `
@@ -156,16 +240,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => { : '' }
- ${xLabel}: ${formatNumber(d.x)} + ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${ selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu'] ? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)} + Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
` : '' } @@ -173,7 +257,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu'] ? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)} + Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
` : '' } @@ -182,10 +266,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} + ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))} ${ isPinned ? `
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${tooltipLine('Total GPUs', d.tp)} ${generateParallelismHTML(d)}
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`; }; @@ -271,16 +358,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => : '' }
- ${xLabel}: ${formatNumber(d.x)} + ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)} + ${yLabel}: ${fmt(d.y)}
${ selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu'] ? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)} + Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
` : '' } @@ -288,7 +375,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu'] ? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)} + Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
` : '' } @@ -297,9 +384,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)}
`; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index a9e087b2..19b4bfb0 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -5,17 +5,30 @@ import { Info } from 'lucide-react'; import { LabelWithTooltip } from '@/components/ui/label-with-tooltip'; import { track } from '@/lib/analytics'; import { MultiSelect } from '@/components/ui/multi-select'; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectLabel, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; import { TooltipContent, TooltipRoot, TooltipTrigger } from '@/components/ui/tooltip'; import { type Model, type Precision, type Sequence, + type Percentile, + PERCENTILE_OPTIONS, getModelCategory, getModelLabel, + getPercentileLabel, getPrecisionLabel, getSequenceCategory, getSequenceLabel, groupByCategory, + sequenceKind, } from '@/lib/data-mappings'; function DeprecatedSectionTitle({ reason }: { reason: string }) { @@ -200,6 +213,132 @@ export function SequenceSelector({ ); } +interface ScenarioSelectorProps { + id?: string; + value: string; + onChange: (value: Sequence) => void; + open?: boolean; + onOpenChange?: (open: boolean) => void; + availableSequences: string[]; + 'data-testid'?: string; +} + +/** + * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length", + * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL + * framing only applies to the fixed-seq subset). 
+ */ +export function ScenarioSelector({ + id = 'scenario-select', + value, + onChange, + open, + onOpenChange, + availableSequences, + 'data-testid': testId, +}: ScenarioSelectorProps) { + const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq'); + const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic'); + const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence)); + + return ( +
+ + +
+ ); +} + +interface PercentileSelectorProps { + id?: string; + value: string; + onChange: (value: Percentile) => void; + 'data-testid'?: string; +} + +/** + * Latency percentile selector for agentic-trace charts. The selected value + * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so + * picking p99 plots p99 e2e latency / interactivity instead of the median. + */ +export function PercentileSelector({ + id = 'percentile-select', + value, + onChange, + 'data-testid': testId, +}: PercentileSelectorProps) { + return ( +
+ + +
+ ); +} + interface PrecisionSelectorProps { id?: string; value: string[]; diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx index 0392ac10..44013b1b 100644 --- a/packages/app/src/components/ui/d3-chart-wrapper.tsx +++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx @@ -1,6 +1,41 @@ 'use client'; -import React from 'react'; +import React, { useEffect, useState } from 'react'; +import { createPortal } from 'react-dom'; + +/** + * Renders the d3 tooltip element via React Portal to document.body so it + * escapes any parent stacking context (e.g. the chart Card's backdrop-filter + * creates one, trapping z-index inside it). Position is set as viewport + * coordinates by the d3 layer. + */ +function PortalTooltip({ + tooltipRef, + pinned, +}: { + tooltipRef: React.RefObject; + pinned: boolean; +}) { + const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const node = ( +
+ ); + if (!mounted || typeof document === 'undefined') return node; + return createPortal(node, document.body); +} export interface D3ChartWrapperProps { chartId: string; @@ -72,17 +107,11 @@ export function D3ChartWrapper({ } }} /> -
+ {/* Tooltip is portalled to with position:fixed so it can + rise above sibling chart cards' stacking contexts. The d3 layer + writes viewport-coords into style.left/top — see + computeTooltipPosition. */} + {noDataOverlay}

{instructions}

diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts index 1863060d..3c24d32b 100644 --- a/packages/app/src/components/unofficial-run-provider.test.ts +++ b/packages/app/src/components/unofficial-run-provider.test.ts @@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */ function stubRow(overrides: Partial = {}): BenchmarkRow { return { + id: 1, hardware: 'h200', framework: 'sglang', model: 'dsr1', @@ -29,6 +30,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 6fd3aba1..dd2b0dbf 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -12,7 +12,7 @@ import { import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types'; import { UnofficialBanner } from '@/components/ui/unofficial-banner'; -import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants'; import { computeToggle } from '@/hooks/useTogglableSet'; import type { BenchmarkRow, EvalRow } from '@/lib/api'; import { normalizeEvalHardwareKey } from '@/lib/chart-utils'; @@ -110,7 +110,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData const groups = new Map(); for (const row of benchmarks) { const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? 
row.model; - const sequence = islOslToSequence(row.isl, row.osl); + const sequence = rowToSequence(row); if (!sequence) continue; const key = `${displayModel}_${sequence}`; if (!groups.has(key)) groups.set(key, []); diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts new file mode 100644 index 00000000..4ca25ee2 --- /dev/null +++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts @@ -0,0 +1,45 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface MetricPercentiles { + mean: number; + p50: number; + p75: number; + p90: number; + p99: number; + n: number; +} + +export interface AgenticAggregate { + id: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; +} + +export type AgenticAggregateMap = Record; + +async function fetchAgenticAggregates( + ids: number[], + signal?: AbortSignal, +): Promise { + if (ids.length === 0) return {}; + const res = await fetch(`/api/v1/agentic-aggregates?ids=${ids.join(',')}`, { signal }); + if (!res.ok) throw new Error(`agentic-aggregates ${res.status}`); + return (await res.json()) as AgenticAggregateMap; +} + +/** + * Fetch per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL, KV + * cache utilization, and prefix cache hit rate. Used by the "Aggregates + * across configs" view on the agentic detail page. 
+ */ +export function useAgenticAggregates(ids: number[], enabled = true) { + const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); + return useQuery({ + queryKey: ['agentic-aggregates', sortedKey.join(',')] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => fetchAgenticAggregates(sortedKey, signal), + enabled: enabled && sortedKey.length > 0, + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts new file mode 100644 index 00000000..1ea90c0d --- /dev/null +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -0,0 +1,46 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface BenchmarkSibling { + id: number; + conc: number; + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + prefill_tp: number; + prefill_ep: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + is_current: boolean; + has_trace: boolean; +} + +export interface BenchmarkSku { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + github_run_id: number; + date: string; +} + +export interface BenchmarkSiblings { + sku: BenchmarkSku; + siblings: BenchmarkSibling[]; +} + +export function useBenchmarkSiblings(id: number | null) { + return useQuery({ + queryKey: ['benchmark-siblings', id] as const, + queryFn: async ({ signal }) => { + const res = await fetch(`/api/v1/benchmark-siblings?id=${id}`, { signal }); + if (res.status === 404) return null; + if (!res.ok) throw new Error(`benchmark-siblings ${res.status}`); + return (await res.json()) as BenchmarkSiblings; + }, + enabled: id !== null && id > 0, + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts index 7329896d..c4f49130 100644 --- a/packages/app/src/hooks/api/use-benchmarks.test.ts 
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts @@ -5,12 +5,29 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; describe('benchmarkQueryOptions', () => { it('builds query key from model and date', () => { const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01'); - expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']); + expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest', '']); }); it('builds exact query key when exact=true', () => { const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true); - expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']); + expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', '']); + }); + + it('includes runId in query key when provided', () => { + const opts = benchmarkQueryOptions( + 'DeepSeek-R1-0528', + '2026-03-01', + true, + false, + '26194160120', + ); + expect(opts.queryKey).toEqual([ + 'benchmarks', + 'DeepSeek-R1-0528', + '2026-03-01', + 'latest', + '26194160120', + ]); }); it('produces distinct keys for different models', () => { diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts index 6da1568e..8fd1f4e9 100644 --- a/packages/app/src/hooks/api/use-benchmarks.ts +++ b/packages/app/src/hooks/api/use-benchmarks.ts @@ -8,14 +8,16 @@ export function benchmarkQueryOptions( date: string, enabled = true, exact?: boolean, + runId?: string, ) { return { - queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const, - queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal), + queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? 
''] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => + fetchBenchmarks(model, date, exact, signal, runId), enabled: enabled && Boolean(model), }; } -export function useBenchmarks(model: string, date?: string, enabled = true) { - return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled)); +export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) { + return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId)); } diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts new file mode 100644 index 00000000..6bc7ae5e --- /dev/null +++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts @@ -0,0 +1,41 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface DerivedAgenticMetric { + id: number; + /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled + * by mean_load / session_load. Null when the JSONL had no usable records. */ + normalized_session_time_s: number | null; + /** P90 of per-turn ISL/TTFT across every turn in every session. + * Null when no prefill rates could be computed. */ + p90_prefill_tps_per_user: number | null; +} + +export type DerivedAgenticMetricMap = Record; + +async function fetchDerivedAgenticMetrics( + ids: number[], + signal?: AbortSignal, +): Promise { + if (ids.length === 0) return {}; + const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal }); + if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); + return (await res.json()) as DerivedAgenticMetricMap; +} + +/** + * Fetch per-id derived agentic metrics (session time + p90 prefill TPS/user) + * computed live from the stored aiperf profile_export.jsonl. Used to drive + * the "Session Time" and "Prefill TPS/user" chart variants. 
+ * + * Ids without a trace_replay blob (older or non-aiperf agentic runs) are + * silently omitted from the response. + */ +export function useDerivedAgenticMetrics(ids: number[], enabled = true) { + const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); + return useQuery({ + queryKey: ['derived-agentic-metrics', sortedKey.join(',')] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => fetchDerivedAgenticMetrics(sortedKey, signal), + enabled: enabled && sortedKey.length > 0, + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts new file mode 100644 index 00000000..d3ceaab8 --- /dev/null +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -0,0 +1,59 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface RequestRecord { + /** Conversation id (groups turns of one agent session). */ + cid: string; + /** Zero-based turn index within the conversation. */ + ti: number; + /** Worker id (concurrency slot that handled this request). */ + wid: string; + /** Sub-agent depth (0 = top-level). */ + ad: number; + /** `warmup` or `profiling`. */ + phase: string; + /** ns offset from timeline.startNs. Load gen decided to dispatch. */ + credit: number; + /** ns offset from timeline.startNs. HTTP send started. */ + start: number; + /** ns offset from timeline.startNs. First server acknowledgement (or null). */ + ack: number | null; + /** ns offset from timeline.startNs. Last byte received. 
*/ + end: number; + ttftMs: number | null; + isl: number | null; + osl: number | null; + cancelled: boolean; +} + +export interface RequestTimeline { + version: number; + startNs: number; + endNs: number; + durationS: number; + requests: RequestRecord[]; +} + +async function fetchRequestTimeline( + id: number, + signal?: AbortSignal, +): Promise { + const res = await fetch(`/api/v1/request-timeline?id=${id}`, { signal }); + if (res.status === 404) return null; + if (!res.ok) throw new Error(`request-timeline ${res.status}`); + return (await res.json()) as RequestTimeline; +} + +/** + * Lazy-fetch the per-request Gantt timeline for one agentic point. + * Enabled only when the caller opts in (e.g. the timeline view becomes + * active), so the payload (~30 KB per point) isn't paid for every page load. + */ +export function useRequestTimeline(id: number | null, enabled = false) { + return useQuery({ + queryKey: ['request-timeline', id] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => + id ? fetchRequestTimeline(id, signal) : Promise.resolve(null), + enabled: enabled && Boolean(id), + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts new file mode 100644 index 00000000..db4220d2 --- /dev/null +++ b/packages/app/src/hooks/api/use-trace-histograms.ts @@ -0,0 +1,39 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface TraceHistogramPoint { + id: number; + /** Input sequence length (tokens) per completed request. */ + isl: number[]; + /** Output sequence length (tokens) per completed request. 
*/ + osl: number[]; +} + +export type TraceHistogramMap = Record; + +async function fetchTraceHistograms( + ids: number[], + signal?: AbortSignal, +): Promise { + if (ids.length === 0) return {}; + const res = await fetch(`/api/v1/trace-histograms?ids=${ids.join(',')}`, { signal }); + if (!res.ok) throw new Error(`trace-histograms ${res.status}`); + return (await res.json()) as TraceHistogramMap; +} + +/** + * Fetch per-request ISL/OSL arrays for a set of benchmark_results.id values. + * Ids without a stored trace_replay blob are silently omitted from the response. + * + * Caller passes the agentic id set currently on screen; React Query handles + * dedup + stale-while-revalidate. Cache key is sorted-ids-comma-joined so + * any permutation of the same set hits the same cache entry. + */ +export function useTraceHistograms(ids: number[], enabled = true) { + const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); + return useQuery({ + queryKey: ['trace-histograms', sortedKey.join(',')] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceHistograms(sortedKey, signal), + enabled: enabled && sortedKey.length > 0, + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts new file mode 100644 index 00000000..8418aa4f --- /dev/null +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -0,0 +1,70 @@ +import { useQuery } from '@tanstack/react-query'; + +export interface TimeSeriesPoint { + /** Seconds from benchmark start. 
*/ + t: number; + value: number; +} +export interface QueueDepthPoint { + t: number; + running: number; + waiting: number; + total: number; +} +export interface PointMeta { + id: number; + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + disagg: boolean; + conc: number; + offload_mode: string | null; + isl: number | null; + osl: number | null; + benchmark_type: string; + date: string; + run_url: string | null; + server_gpu_cache_hit_rate: number | null; + server_cpu_cache_hit_rate: number | null; +} + +export interface TraceServerMetrics { + meta: PointMeta; + startNs: number; + endNs: number; + durationS: number; + timeslicesCount: number; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; +} + +async function fetchTraceServerMetrics( + id: number, + signal?: AbortSignal, +): Promise { + const res = await fetch(`/api/v1/trace-server-metrics?id=${id}`, { signal }); + if (res.status === 404) return null; + if (!res.ok) throw new Error(`trace-server-metrics ${res.status}`); + return (await res.json()) as TraceServerMetrics; +} + +/** + * Lazy-fetch parsed server-metric time-series for one agentic point. + * Enabled only when the caller passes `enabled=true` (the detail panel opens), + * so we don't pay the parse cost on every hover. + */ +export function useTraceServerMetrics(id: number | null, enabled = false) { + return useQuery({ + queryKey: ['trace-server-metrics', id] as const, + queryFn: ({ signal }: { signal: AbortSignal }) => + id ? 
fetchTraceServerMetrics(id, signal) : Promise.resolve(null), + enabled: enabled && Boolean(id), + staleTime: 5 * 60 * 1000, + }); +} diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 999cbfde..31cf906a 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -6,6 +6,8 @@ import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { + /** Stable per-point id from benchmark_results; used to look up trace histograms. */ + id: number; hardware: string; framework: string; model: string; @@ -23,9 +25,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -115,10 +121,13 @@ export function fetchBenchmarks( date?: string, exact?: boolean, signal?: AbortSignal, + /** Optional github_run_id to scope to a specific workflow run. */ + runId?: string, ) { const params = new URLSearchParams({ model }); if (date) params.set('date', date); if (exact) params.set('exact', 'true'); + if (runId) params.set('runId', runId); return fetchJson(`/api/v1/benchmarks?${params}`, signal); } @@ -141,13 +150,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. 
+ isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index be76438e..fcbca681 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -6,6 +6,7 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform function makeRow(overrides: Partial = {}): BenchmarkRow { return { + id: 1, hardware: 'h200', framework: 'trt', model: 'dsr1', @@ -23,6 +24,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 64, diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 107f0b12..3594750c 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -15,10 +15,42 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils'; import { getHardwareConfig } from '@/lib/constants'; import type { BenchmarkRow } from '@/lib/api'; +/** + * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl + * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here: + * e2el ≡ ttlt (time-to-last-token == end-to-end latency) + * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) + * intvty ≡ 1/itl (tok/s from the user's perspective) + * Existing fields win if present; we only fill in the gaps. 
+ */ +function agenticAliases(m: Record): Record { + const out: Record = {}; + for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { + const itl = m[`${suffix}_itl`]; + const ttlt = m[`${suffix}_ttlt`]; + if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; + if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; + if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) { + out[`${suffix}_intvty`] = 1 / itl; + } + } + return out; +} + /** Convert a DB benchmark row to an AggDataEntry. */ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { - const m = row.metrics; + const isAgentic = row.benchmark_type === 'agentic_traces'; + const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics; + // Prefer the dedicated column (added in migration 004); fall back to the + // legacy stash inside `metrics` for any rows ingested before that column + // existed. + const rawMetrics = row.metrics as Record; + const offloadMode = + row.offload_mode ?? + (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined); return { + // Coerce: Postgres bigint comes through the SQL client as a string. + id: typeof row.id === 'number' ? row.id : Number(row.id), hw: row.hardware, framework: row.framework, model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model, @@ -32,23 +64,43 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { mean_ttft: m.mean_ttft ?? 0, median_ttft: m.median_ttft ?? 0, std_ttft: m.std_ttft ?? 0, + p75_ttft: m.p75_ttft ?? 0, + p90_ttft: m.p90_ttft ?? 0, + p95_ttft: m.p95_ttft ?? 0, p99_ttft: m.p99_ttft ?? 0, + 'p99.9_ttft': m['p99.9_ttft'] ?? 0, mean_tpot: m.mean_tpot ?? 0, median_tpot: m.median_tpot ?? 0, std_tpot: m.std_tpot ?? 0, + p75_tpot: m.p75_tpot ?? 0, + p90_tpot: m.p90_tpot ?? 0, + p95_tpot: m.p95_tpot ?? 0, p99_tpot: m.p99_tpot ?? 0, + 'p99.9_tpot': m['p99.9_tpot'] ?? 
0, mean_intvty: m.mean_intvty ?? 0, median_intvty: m.median_intvty ?? 0, std_intvty: m.std_intvty ?? 0, + p75_intvty: m.p75_intvty ?? 0, + p90_intvty: m.p90_intvty ?? 0, + p95_intvty: m.p95_intvty ?? 0, p99_intvty: m.p99_intvty ?? 0, + 'p99.9_intvty': m['p99.9_intvty'] ?? 0, mean_itl: m.mean_itl ?? 0, median_itl: m.median_itl ?? 0, std_itl: m.std_itl ?? 0, + p75_itl: m.p75_itl ?? 0, + p90_itl: m.p90_itl ?? 0, + p95_itl: m.p95_itl ?? 0, p99_itl: m.p99_itl ?? 0, + 'p99.9_itl': m['p99.9_itl'] ?? 0, mean_e2el: m.mean_e2el ?? 0, median_e2el: m.median_e2el ?? 0, std_e2el: m.std_e2el ?? 0, + p75_e2el: m.p75_e2el ?? 0, + p90_e2el: m.p90_e2el ?? 0, + p95_e2el: m.p95_e2el ?? 0, p99_e2el: m.p99_e2el ?? 0, + 'p99.9_e2el': m['p99.9_e2el'] ?? 0, disagg: row.disagg, num_prefill_gpu: row.num_prefill_gpu, num_decode_gpu: row.num_decode_gpu, @@ -68,6 +120,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? undefined, + benchmark_type: row.benchmark_type, + isl: row.isl, + osl: row.osl, + offload_mode: offloadMode, + server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate, + server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate, + theoretical_cache_hit_rate: m.theoretical_cache_hit_rate, + num_requests_total: m.num_requests_total, + num_requests_successful: m.num_requests_successful, + total_prompt_tokens: m.total_prompt_tokens, + total_generation_tokens: m.total_generation_tokens, }; } @@ -77,13 +140,30 @@ interface PreparedEntry { date: string; } +/** + * Rewrite a chart x-axis key to use a different latency percentile prefix + * (`median_` → `p99_` etc). Only touches keys that start with a known + * percentile prefix; leaves everything else alone. 
+ */ +export function withPercentile(key: string, percentile: string): string { + return key.replace(/^(mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`); +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). * * Converts rows to AggDataEntry once, then reuses for each chart definition. + * + * @param percentile Optional latency percentile for the chart x-axis + * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart + * definition for the chosen percentile — only agentic rows carry the + * full set (median/p90/p99/p99.9) so this mainly affects that scenario. */ -export function transformBenchmarkRows(rows: BenchmarkRow[]): { +export function transformBenchmarkRows( + rows: BenchmarkRow[], + percentile = 'median', +): { chartData: InferenceData[][]; hardwareConfig: HardwareConfig; } { @@ -109,13 +189,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): { // Phase 2: Build chart data per chart definition (reusing prepared entries) const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => { + const xKey = withPercentile(chartDef.x, percentile); const groupedByHw: Record = {}; for (const { entry, hwKey, date } of prepared) { const dataPoint = createChartDataPoint( date, entry, - chartDef.x as keyof AggDataEntry, + xKey as keyof AggDataEntry, chartDef.y as keyof AggDataEntry, hwKey, ); diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts index f0f1ef5b..da81ca0e 100644 --- a/packages/app/src/lib/compare-pair-defaults.test.ts +++ b/packages/app/src/lib/compare-pair-defaults.test.ts @@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults'; function makeRow(overrides: Partial): BenchmarkRow { return { + id: 1, hardware: 'h100', framework: 'sglang', model: 'dsr1', @@ -30,6 +31,8 @@ function makeRow(overrides: Partial): 
BenchmarkRow { metrics: { tput_per_gpu: 100 }, date: '2026-01-01', run_url: null, + benchmark_type: 'single_turn', + offload_mode: 'off', ...overrides, }; } diff --git a/packages/app/src/lib/compare-pair-defaults.ts b/packages/app/src/lib/compare-pair-defaults.ts index be6450ad..f5a37e1f 100644 --- a/packages/app/src/lib/compare-pair-defaults.ts +++ b/packages/app/src/lib/compare-pair-defaults.ts @@ -14,6 +14,7 @@ export function pickPairDefaults( const seenB = new Map>(); for (const row of rows) { if (row.hardware !== a && row.hardware !== b) continue; + if (row.isl === null || row.osl === null) continue; const seq = islOslToSequence(row.isl, row.osl); if (!seq) continue; const key = `${seq}|${row.precision}`; diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index a3d28315..421ac69b 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -61,17 +61,33 @@ export function renderScatterPoints { + text + .append('tspan') + .attr('x', 0) + .attr('dy', i === 0 ? `${firstDy}em` : '1.1em') + .text(line); + }); + }); } // Exit: remove stale points @@ -128,20 +144,32 @@ export function renderScatterPoints('.point-label') + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .selectAll('.point-label') .data([true]) .join('text') .attr('class', 'point-label') - .attr('dy', -8) .attr('text-anchor', 'middle') .attr('fill', config.foreground!) .attr('font-size', '10px') - .attr('pointer-events', 'none') - .text(config.getLabelText!(d)); + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + const firstDy = -(0.8 + (lines.length - 1) * 1.1); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? 
`${firstDy}em` : '1.1em')) + .text((l) => l); }); } else { points.selectAll('.point-label').remove(); @@ -261,7 +289,21 @@ export function attachScatterTooltipHandlers< }); } -/** Compute tooltip left/top, flipping when it would overflow the chart container. */ +/** + * Compute tooltip left/top **in viewport coordinates** so the tooltip can be + * rendered via portal with `position: fixed`. Callers still pass cursor coords + * relative to `container` (matching `d3.pointer(event, container)`). + * + * Why viewport coords: the chart cards use `backdrop-filter`, which creates + * a stacking context. A tooltip painted inside the upper card's stacking + * context cannot rise above the lower card's stacking context regardless of + * its z-index. Portalling to document.body + `position: fixed` sidesteps the + * whole problem; we just need the coordinates in viewport space. + * + * Strategy: pick preferred side (right/below cursor), flip if it overflows the + * container, then clamp to container bounds. Tall tooltips that don't fit get + * clamped to the container edges. + */ export function computeTooltipPosition( mx: number, my: number, @@ -280,13 +322,21 @@ export function computeTooltipPosition( // Force reflow so we get real dimensions const tw = node.getBoundingClientRect().width || node.offsetWidth; const th = node.getBoundingClientRect().height || node.offsetHeight; + const rect = container.getBoundingClientRect(); const cw = container.clientWidth; const ch = container.clientHeight; + const EDGE_PAD = 4; + + // Prefer right of cursor; flip to left if no room. + let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw; + left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left)); - const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset; - const top = my + offset + th > ch ? my - offset - th : my + offset; + // Prefer below cursor; flip above if no room. + let top = my + offset + th <= ch ? 
my + offset : my - offset - th; + top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top)); - return { left, top }; + // Convert container-local coords → viewport coords for `position: fixed`. + return { left: left + rect.left, top: top + rect.top }; } /** Update scatter point positions on zoom. */ diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 6a543925..c18266ba 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -138,17 +138,73 @@ export enum Sequence { OneK_OneK = '1k/1k', OneK_EightK = '1k/8k', EightK_OneK = '8k/1k', + AgenticTraces = 'agentic-traces', } -const SEQUENCE_CONFIG: Record = - { - [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' }, - [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' }, - [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' }, - }; +/** + * Top-level scenario kind. Fixed-seq sequences cluster under a single group + * in the selector; agentic traces sit alongside as their own kind. + */ +export type ScenarioKind = 'fixed-seq' | 'agentic'; + +export function sequenceKind(seq: Sequence): ScenarioKind { + return seq === Sequence.AgenticTraces ? 
'agentic' : 'fixed-seq'; +} + +const SEQUENCE_CONFIG: Record< + Sequence, + { label: string; compact: string; category: CategoryTag; kind: ScenarioKind } +> = { + [Sequence.OneK_OneK]: { + label: '1K / 1K', + compact: '1k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.OneK_EightK]: { + label: '1K / 8K', + compact: '1k8k', + category: 'deprecated', + kind: 'fixed-seq', + }, + [Sequence.EightK_OneK]: { + label: '8K / 1K', + compact: '8k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.AgenticTraces]: { + label: 'Agentic Traces', + compact: 'agentic', + category: 'default', + kind: 'agentic', + }, +}; export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; +/** + * Percentile of the latency distribution used for the chart x-axis when + * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9 + * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl); + * p75 and p90 are surfaced in the UI. + */ +export enum Percentile { + P75 = 'p75', + P90 = 'p90', +} + +const PERCENTILE_CONFIG: Record = { + [Percentile.P75]: { label: 'p75' }, + [Percentile.P90]: { label: 'p90' }, +}; + +export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; + +export function getPercentileLabel(p: Percentile): string { + return PERCENTILE_CONFIG[p]?.label ?? 
p; +} + export const DEPRECATED_SEQUENCES: ReadonlySet = new Set( (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][]) .filter(([, c]) => c.category === 'deprecated') diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts index 28cc1e36..2f5844c1 100644 --- a/packages/app/src/lib/energy-metrics.test.ts +++ b/packages/app/src/lib/energy-metrics.test.ts @@ -57,23 +57,43 @@ function makeEntry(overrides: Partial = {}): AggDataEntry { mean_ttft: 0.5, median_ttft: 0.4, std_ttft: 0.1, + p75_ttft: 0.65, + p90_ttft: 0.7, + p95_ttft: 0.75, p99_ttft: 0.8, + 'p99.9_ttft': 0.9, mean_tpot: 0.02, mean_intvty: 45, median_tpot: 0.02, median_intvty: 44, std_tpot: 0.005, std_intvty: 5, + p75_tpot: 0.022, + p75_intvty: 50, + p90_tpot: 0.025, + p90_intvty: 55, + p95_tpot: 0.028, + p95_intvty: 58, p99_tpot: 0.03, p99_intvty: 60, + 'p99.9_tpot': 0.035, + 'p99.9_intvty': 65, mean_itl: 0.01, median_itl: 0.01, std_itl: 0.002, + p75_itl: 0.012, + p90_itl: 0.013, + p95_itl: 0.014, p99_itl: 0.015, + 'p99.9_itl': 0.018, mean_e2el: 5, median_e2el: 4.8, std_e2el: 0.5, + p75_e2el: 5.2, + p90_e2el: 5.5, + p95_e2el: 5.8, p99_e2el: 6, + 'p99.9_e2el': 6.5, disagg: false, num_prefill_gpu: 0, num_decode_gpu: 0, diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts index ebaa5336..73cbe0b7 100644 --- a/packages/app/src/lib/url-state.ts +++ b/packages/app/src/lib/url-state.ts @@ -22,8 +22,10 @@ const URL_STATE_KEYS = [ 'i_seq', 'i_prec', 'i_metric', + 'i_pctl', 'i_xmetric', 'i_e2e_xmetric', + 'i_xmode', 'i_scale', 'i_gpus', 'i_dates', @@ -66,8 +68,10 @@ export const PARAM_DEFAULTS: Record = { i_seq: '8k/1k', i_prec: 'fp4', i_metric: 'y_tpPerGpu', - i_xmetric: 'p99_ttft', - i_e2e_xmetric: '', + i_pctl: 'p90', + i_xmetric: 'p90_ttft', + i_e2e_xmetric: 'p90_ttft', + i_xmode: '', i_scale: 'auto', i_gpus: '', i_dates: '', diff --git a/packages/constants/src/framework-aliases.ts 
b/packages/constants/src/framework-aliases.ts index cc5eb6b4..e23a93bc 100644 --- a/packages/constants/src/framework-aliases.ts +++ b/packages/constants/src/framework-aliases.ts @@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record = { ]), ), mtp: 'MTP', + aiperf: 'AIPerf', }; /** diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts index cf2c4d0b..70e50f96 100644 --- a/packages/constants/src/metric-keys.ts +++ b/packages/constants/src/metric-keys.ts @@ -1,46 +1,110 @@ /** * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column. * - * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU. + * Latency values (ttft/tpot/itl/e2el) are in seconds; intvty (interactivity) is a rate derived from per-token latency, not a duration. Throughput values are + * tokens/sec — `_per_gpu` is per-GPU, `_tps` is total tokens/sec across the deployment. + * + * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency, + * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs + * carry median/mean/p99/std for latency only. 
*/ export const METRIC_KEYS = new Set([ // throughput (tokens/sec/GPU) 'tput_per_gpu', 'output_tput_per_gpu', 'input_tput_per_gpu', + // throughput (tokens/sec, deployment total) — agentic aiperf reports both + 'total_tput_tps', + 'output_tput_tps', + 'input_tput_tps', // TTFT — time to first token 'median_ttft', 'mean_ttft', + 'p75_ttft', 'p90_ttft', + 'p95_ttft', 'p99_ttft', 'p99.9_ttft', 'std_ttft', // TPOT — time per output token 'median_tpot', 'mean_tpot', + 'p75_tpot', 'p90_tpot', + 'p95_tpot', 'p99_tpot', 'p99.9_tpot', 'std_tpot', // ITL — inter-token latency 'median_itl', 'mean_itl', + 'p75_itl', 'p90_itl', + 'p95_itl', 'p99_itl', 'p99.9_itl', 'std_itl', // E2EL — end-to-end latency 'median_e2el', 'mean_e2el', + 'p75_e2el', 'p90_e2el', + 'p95_e2el', 'p99_e2el', 'p99.9_e2el', 'std_e2el', // interactivity 'median_intvty', 'mean_intvty', + 'p75_intvty', 'p90_intvty', + 'p95_intvty', 'p99_intvty', 'p99.9_intvty', 'std_intvty', + // QPS — queries per second (agentic aiperf) + 'median_qps', + 'mean_qps', + 'p75_qps', + 'p90_qps', + 'p95_qps', + 'p99_qps', + 'p99.9_qps', + 'std_qps', + // per-request input token count distribution + 'median_input_tokens', + 'mean_input_tokens', + 'p75_input_tokens', + 'p90_input_tokens', + 'p95_input_tokens', + 'p99_input_tokens', + 'p99.9_input_tokens', + 'std_input_tokens', + // per-request output token count distribution — actual served + 'median_output_tokens_actual', + 'mean_output_tokens_actual', + 'p75_output_tokens_actual', + 'p90_output_tokens_actual', + 'p95_output_tokens_actual', + 'p99_output_tokens_actual', + 'p99.9_output_tokens_actual', + 'std_output_tokens_actual', + // per-request output token count distribution — expected from trace + 'median_output_tokens_expected', + 'mean_output_tokens_expected', + 'p75_output_tokens_expected', + 'p90_output_tokens_expected', + 'p95_output_tokens_expected', + 'p99_output_tokens_expected', + 'p99.9_output_tokens_expected', + 'std_output_tokens_expected', + // run totals 
(agentic aiperf) + 'duration_seconds', + 'total_requests_completed', + 'total_prompt_tokens', + 'total_generation_tokens', + // server prefix-cache observability (agentic aiperf) + 'server_gpu_cache_hit_rate', + 'server_cpu_cache_hit_rate', + 'theoretical_cache_hit_rate', ]); diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts index c75034c7..783d239d 100644 --- a/packages/constants/src/models.ts +++ b/packages/constants/src/models.ts @@ -54,3 +54,20 @@ export function islOslToSequence(isl: number, osl: number): string | null { }; return map[`${isl}_${osl}`] ?? null; } + +/** + * Map a benchmark/availability row to its sequence (scenario) string. + * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl. + * - Other rows (today: `single_turn`) fall back to `islOslToSequence`. + * Returns `null` for rows that can't be classified (e.g. `single_turn` with + * unmapped isl/osl values). + */ +export function rowToSequence(row: { + isl: number | null; + osl: number | null; + benchmark_type: string; +}): string | null { + if (row.benchmark_type === 'agentic_traces') return 'agentic-traces'; + if (row.isl === null || row.osl === null) return null; + return islOslToSequence(row.isl, row.osl); +} diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql new file mode 100644 index 00000000..c143914e --- /dev/null +++ b/packages/db/migrations/002_agentic_scenario.sql @@ -0,0 +1,30 @@ +-- Support agentic scenarios in benchmark_results. +-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. 
+ +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql new file mode 100644 index 00000000..e96cbd50 --- /dev/null +++ b/packages/db/migrations/003_agentic_availability.sql @@ -0,0 +1,21 @@ +-- Extend the availability table to cover agentic scenarios. +-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. 
+ +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql new file mode 100644 index 00000000..24b617f1 --- /dev/null +++ b/packages/db/migrations/004_offload_mode.sql @@ -0,0 +1,42 @@ +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. 
+drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql new file mode 100644 index 00000000..398bc725 --- /dev/null +++ b/packages/db/migrations/006_agentic_trace_replay.sql @@ -0,0 +1,34 @@ +-- Capture raw aiperf trace files per agentic benchmark point. +-- +-- The aiperf harness produces two per-point export files inside each +-- `agentic_` artifact: +-- - profile_export.jsonl (~2 MB raw, per-request data) +-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) +-- +-- We persist them so the dashboard can later show per-request distributions, +-- KV cache utilization over time, and conversation traces without needing to +-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at +-- ~500 KB per point post-gzip the total fits comfortably without a separate +-- blob service. +-- +-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK +-- column on benchmark_results). Older, non-aiperf agentic runs simply have a +-- NULL `trace_replay_id`. 
+ +create table agentic_trace_replay ( + id bigserial primary key, + -- gzip(profile_export.jsonl); null when only the server metrics file existed + profile_export_jsonl_gz bytea, + profile_export_uncompressed_size bigint, + -- raw csv bytes; null when only the profile file existed + server_metrics_csv bytea, + server_metrics_csv_size bigint, + created_at timestamptz not null default now() +); + +alter table benchmark_results + add column trace_replay_id bigint references agentic_trace_replay(id); + +create index benchmark_results_trace_replay_idx + on benchmark_results (trace_replay_id) + where trace_replay_id is not null; diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql new file mode 100644 index 00000000..ba7bd095 --- /dev/null +++ b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql @@ -0,0 +1,17 @@ +-- Add the full server-metrics time-series JSON to agentic_trace_replay. +-- +-- The existing `server_metrics_csv` column holds aiperf's summary export — +-- one row per metric with avg/min/max/std/p1..p99 across the entire run. +-- That's enough for the cumulative cache-hit number but not for any +-- "metric over time" view (KV cache utilization curve, queue depth, prefix +-- hit rate per interval, cumulative prefill token source). +-- +-- The harness also writes `server_metrics_export.json` which contains the +-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole +-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x +-- to ~6 MB gzipped (text with repeated metric names + numeric values). +-- That's the file we store here for any future time-series chart. 
+ +alter table agentic_trace_replay + add column server_metrics_json_gz bytea, + add column server_metrics_json_uncompressed_size bigint; diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql new file mode 100644 index 00000000..d55533b9 --- /dev/null +++ b/packages/db/migrations/008_agentic_aggregate_stats.sql @@ -0,0 +1,18 @@ +-- Pre-computed aggregate stats for each agentic_trace_replay row. +-- +-- Previously the agentic detail page parsed the (huge) profile_export.jsonl +-- and server_metrics_json blobs on every request to compute distribution +-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived +-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the +-- worst rows (high-conc TP+EP server_metrics blobs that decompress past +-- Node's 512 MB string cap) couldn't be parsed without a stream fallback. +-- +-- This column holds the computed stats so the API serves the page from a +-- single SQL row read. Shape mirrors the existing benchmark_results.metrics +-- JSONB convention; an inner `version` field lets the backfill script +-- detect rows whose stats were computed by an older algorithm and +-- recompute them. Null when stats haven't been computed yet (existing +-- rows pre-backfill; the API has a slow-path fallback for that case). + +alter table agentic_trace_replay + add column aggregate_stats jsonb; diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql new file mode 100644 index 00000000..b42718b9 --- /dev/null +++ b/packages/db/migrations/009_agentic_chart_series.sql @@ -0,0 +1,19 @@ +-- Pre-computed time-series for the agentic detail page chart. 
+-- +-- Sibling to `aggregate_stats` (migration 008): that column stores +-- per-row percentile/derived *summaries*, this one stores the full +-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, +-- queueDepth, prefillTps, decodeTps, promptTokensBySource). +-- +-- Without this, the detail page parsed the entire `server_metrics_json_gz` +-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc +-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). +-- With pre-computed series the page is a single SQL row read. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored series were produced by an older algorithm. +-- Null when the series haven't been computed yet; the API has a slow-path +-- fallback (with stream-parse for oversized blobs) for that case. + +alter table agentic_trace_replay + add column chart_series jsonb; diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql new file mode 100644 index 00000000..756b775e --- /dev/null +++ b/packages/db/migrations/010_agentic_request_timeline.sql @@ -0,0 +1,15 @@ +-- Pre-computed per-request timeline for the agentic detail page. +-- +-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one +-- holds a thin per-request array extracted from `profile_export_jsonl_gz` +-- so the detail page can render a Gantt-style swimlane of every request +-- (one bar per conversation turn) without re-parsing the JSONL on every +-- page load. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored timeline was produced by an older +-- algorithm. Null when the timeline hasn't been computed yet; the API +-- falls back to parsing the blob in that case. 
+ +alter table agentic_trace_replay + add column request_timeline jsonb; diff --git a/packages/db/package.json b/packages/db/package.json index c849ea26..710089f1 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -19,6 +19,9 @@ "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts", "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts", "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts", + "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts", + "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts", + "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts", "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts", "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts", "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts", @@ -30,11 +33,14 @@ "@neondatabase/serverless": "^1.1.0", "@noble/ciphers": "^2.2.0", "@semianalysisai/inferencex-constants": "workspace:*", - "postgres": "^3.4.9" + "postgres": "^3.4.9", + "stream-chain": "^3.4.0", + "stream-json": "^2.1.0" }, "devDependencies": { "@types/adm-zip": "^0.5.8", "@types/node": "^25.7.0", + "@types/stream-json": "^1.7.8", "@vitest/coverage-v8": "^4.1.6", "adm-zip": "^0.5.17", "dotenv-cli": "^11.0.0", diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts new file mode 100644 index 00000000..8dd42dce --- /dev/null +++ b/packages/db/src/backfill-aggregate-stats.ts @@ -0,0 +1,150 @@ +/** + * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it + * or were computed by an older `STATS_VERSION`. + * + * The ingest path now computes stats inline, but existing rows (and rows + * whose computation logic has since changed) still need this pass. Run after + * applying migration 008 and any time `STATS_VERSION` bumps. 
+ * + * Strategy: + * - Stream rows one at a time (server_metrics_json_gz can be hundreds of + * MB decompressed for TP+EP / high-conc points — keeping one in memory + * at a time avoids OOM). + * - Skip rows whose stored `aggregate_stats.version` already matches. + * - Recompute via the same `computeAggregateStats()` helper the ingest + * path uses, so behavior cannot drift. + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats + * [--limit N] only process the first N candidate rows (useful for + * smoke-tests on a fresh deploy) + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js'; +import { createAdminSql } from './etl/db-utils.js'; + +interface CliFlags { + limit: number | null; + force: boolean; +} + +function parseFlags(): CliFlags { + let limit: number | null = null; + let force = false; + for (let i = 2; i < process.argv.length; i++) { + const arg = process.argv[i]!; + if (arg === '--force') force = true; + else if (arg === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { limit, force }; +} + +const flags = parseFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-aggregate-stats ==='); + console.log(` STATS_VERSION = ${STATS_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Find candidates: rows missing stats, or whose stored version is stale. + // Using >>'version'::int comparison would error on null; coalesce to -1 so + // null-stats rows always count as stale. 
+ const candidates = flags.force + ? await sql<{ id: number }[]>` + select id + from agentic_trace_replay + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where aggregate_stats is null + or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION} + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + console.log(`\n ${candidates.length} candidate row(s).`); + if (!hasYesFlag()) { + const ok = await confirm('\nProceed? (y/N) '); + if (!ok) { + console.log('Aborted.'); + return; + } + } + + let ok = 0; + let failed = 0; + const t0 = Date.now(); + for (const { id } of candidates) { + const start = Date.now(); + try { + // Fetch one row at a time — the json_gz blob is the heavy field. + const [row] = await sql< + { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[] + >` + select profile_export_jsonl_gz, server_metrics_json_gz + from agentic_trace_replay + where id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + continue; + } + + const stats = await computeAggregateStats({ + profileBlob: row.profile_export_jsonl_gz, + serverBlob: row.server_metrics_json_gz, + }); + + await sql` + update agentic_trace_replay + set aggregate_stats = ${sql.json(structuredClone(stats) as unknown as Parameters[0])} + where id = ${id} + `; + ok++; + const elapsed = Math.round((Date.now() - start) / 1000); + const elapsedTotal = Math.round((Date.now() - t0) / 1000); + console.log( + ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`, + ); + } catch (error) { + failed++; + console.error(` ✗ id=${id}: ${error instanceof Error ? 
error.message : String(error)}`); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`); + if (failed > 0) process.exitCode = 1; +} + +main() + .catch((error) => { + console.error('backfill-aggregate-stats failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts new file mode 100644 index 00000000..66156b45 --- /dev/null +++ b/packages/db/src/backfill-chart-series.ts @@ -0,0 +1,154 @@ +/** + * Backfill `agentic_trace_replay.chart_series` for rows that are missing it + * or were computed by an older `CHART_SERIES_VERSION`. + * + * The ingest path now computes the time-series inline, but existing rows + * (and rows whose computation logic has since changed) still need this + * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION` + * bumps. + * + * Strategy: + * - Stream rows one at a time (server_metrics_json_gz can decompress + * past 500 MB on high-conc TP+EP points — one in memory at a time + * avoids OOM). + * - Skip rows whose stored version already matches. + * - Recompute via the same `computeChartSeries()` helper the ingest + * path uses, so behavior cannot drift. 
+ * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series + * [--limit N] only process the first N candidate rows + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js'; +import { createAdminSql } from './etl/db-utils.js'; + +interface CliFlags { + limit: number | null; + force: boolean; +} + +function parseFlags(): CliFlags { + let limit: number | null = null; + let force = false; + for (let i = 2; i < process.argv.length; i++) { + const arg = process.argv[i]!; + if (arg === '--force') force = true; + else if (arg === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { limit, force }; +} + +const flags = parseFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-chart-series ==='); + console.log(` CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Only rows that actually have a server_metrics blob can produce a + // chart_series. Rows without the blob legitimately keep `chart_series` + // null and the API serves them via the slow path (which also returns + // null because there's no blob to parse — so the page falls into the + // "no stored trace_replay blob" branch). + const candidates = flags.force + ? await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where server_metrics_json_gz is not null + order by id + ${flags.limit ? 
sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where server_metrics_json_gz is not null + and ( + chart_series is null + or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION} + ) + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + console.log(`\n ${candidates.length} candidate row(s).`); + if (!hasYesFlag()) { + const ok = await confirm('\nProceed? (y/N) '); + if (!ok) { + console.log('Aborted.'); + return; + } + } + + let ok = 0; + let failed = 0; + const t0 = Date.now(); + for (const { id } of candidates) { + const start = Date.now(); + try { + const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>` + select server_metrics_json_gz + from agentic_trace_replay + where id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + continue; + } + + const series = await computeChartSeries(row.server_metrics_json_gz); + + await sql` + update agentic_trace_replay + set chart_series = ${ + series === null + ? null + : sql.json(structuredClone(series) as unknown as Parameters[0]) + } + where id = ${id} + `; + ok++; + const elapsed = Math.round((Date.now() - start) / 1000); + const elapsedTotal = Math.round((Date.now() - t0) / 1000); + console.log( + ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`, + ); + } catch (error) { + failed++; + console.error(` ✗ id=${id}: ${error instanceof Error ? 
error.message : String(error)}`); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`); + if (failed > 0) process.exitCode = 1; +} + +main() + .catch((error) => { + console.error('backfill-chart-series failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts new file mode 100644 index 00000000..327099d0 --- /dev/null +++ b/packages/db/src/backfill-request-timeline.ts @@ -0,0 +1,144 @@ +/** + * Backfill `agentic_trace_replay.request_timeline` for rows that are + * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`. + * + * The ingest path now computes the timeline inline, but existing rows + * (and rows whose computation logic has since changed) still need this + * pass. Run after applying migration 010 and any time the version bumps. + * + * Usage: + * pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline + * [--limit N] only process the first N candidate rows + * [--force] recompute every row, even if version already matches + * [--yes] skip the confirmation prompt + */ + +import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; +import { + REQUEST_TIMELINE_VERSION, + computeRequestTimeline, +} from './etl/compute-request-timeline.js'; +import { createAdminSql } from './etl/db-utils.js'; + +interface CliFlags { + limit: number | null; + force: boolean; +} + +function parseFlags(): CliFlags { + let limit: number | null = null; + let force = false; + for (let i = 2; i < process.argv.length; i++) { + const arg = process.argv[i]!; + if (arg === '--force') force = true; + else if (arg === '--limit') { + const next = process.argv[++i]; + if (!next || Number.isNaN(Number(next))) { + console.error('--limit requires a numeric argument'); + process.exit(1); + } + limit = Number(next); + } + } + return { 
limit, force }; +} + +const flags = parseFlags(); + +const sql = createAdminSql({ + noSsl: hasNoSslFlag(), + max: 1, + onnotice: () => {}, +}); + +async function main(): Promise { + console.log('=== backfill-request-timeline ==='); + console.log(` REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`); + console.log(` force = ${flags.force}`); + console.log(` limit = ${flags.limit ?? 'none'}`); + + // Only rows with a profile_export blob can produce a timeline. Rows + // without the blob keep `request_timeline` null and the API serves them + // as "no timeline data". + const candidates = flags.force + ? await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where profile_export_jsonl_gz is not null + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + ` + : await sql<{ id: number }[]>` + select id + from agentic_trace_replay + where profile_export_jsonl_gz is not null + and ( + request_timeline is null + or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION} + ) + order by id + ${flags.limit ? sql`limit ${flags.limit}` : sql``} + `; + + if (candidates.length === 0) { + console.log('\n Nothing to do — all rows up to date.'); + return; + } + + console.log(`\n ${candidates.length} candidate row(s).`); + if (!hasYesFlag()) { + const ok = await confirm('\nProceed? (y/N) '); + if (!ok) { + console.log('Aborted.'); + return; + } + } + + let ok = 0; + let failed = 0; + const t0 = Date.now(); + for (const { id } of candidates) { + const start = Date.now(); + try { + const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>` + select profile_export_jsonl_gz + from agentic_trace_replay + where id = ${id} + `; + if (!row) { + console.warn(` id=${id}: row vanished, skipping`); + continue; + } + const timeline = computeRequestTimeline(row.profile_export_jsonl_gz); + await sql` + update agentic_trace_replay + set request_timeline = ${ + timeline === null + ? 
null + : sql.json(structuredClone(timeline) as unknown as Parameters[0]) + } + where id = ${id} + `; + ok++; + const elapsed = Math.round((Date.now() - start) / 1000); + const elapsedTotal = Math.round((Date.now() - t0) / 1000); + console.log( + ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`, + ); + } catch (error) { + failed++; + console.error(` ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`); + } + } + + const totalSec = Math.round((Date.now() - t0) / 1000); + console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`); + if (failed > 0) process.exitCode = 1; +} + +main() + .catch((error) => { + console.error('backfill-request-timeline failed:', error); + process.exitCode = 1; + }) + .finally(() => sql.end()); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 67173c64..ea802d3f 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows( // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears // more than once in a single batch. Deduplicate within the batch, keeping - // the last occurrence (last metrics for each unique config/isl/osl/conc). + // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode). const seen = new Map(); - for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r); + for (const r of rows) { + seen.set( + `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? 
''}-${r.conc}-${r.offloadMode}`, + r, + ); + } const deduped = [...seen.values()]; const configIds = deduped.map((r) => r.configId); + const benchmarkTypes = deduped.map((r) => r.benchmarkType); + const offloadModes = deduped.map((r) => r.offloadMode); const isls = deduped.map((r) => r.isl); const osls = deduped.map((r) => r.osl); const concs = deduped.map((r) => r.conc); @@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows( const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( - workflow_run_id, config_id, benchmark_type, date, + workflow_run_id, config_id, benchmark_type, offload_mode, date, isl, osl, conc, image, metrics ) select ${workflowRunId}, unnest(${sql.array(configIds)}::int[]), - 'single_turn', + unnest(${sql.array(benchmarkTypes)}::text[]), + unnest(${sql.array(offloadModes)}::text[]), ${date}::date, unnest(${sql.array(isls)}::int[]), unnest(${sql.array(osls)}::int[]), unnest(${sql.array(concs)}::int[]), unnest(${sql.array(images)}), unnest(${sql.array(metricsJsons)}::jsonb[]) - on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) + on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set metrics = excluded.metrics, image = excluded.image @@ -147,13 +155,14 @@ export async function bulkUpsertAvailability( sql: Sql, rows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[], date: string, ): Promise { @@ -162,7 +171,7 @@ export async function bulkUpsertAvailability( const seen = new Set(); const unique: typeof rows = []; for (const r of rows) { - const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`; + const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? 
''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`; if (!seen.has(key)) { seen.add(key); unique.push(r); @@ -170,7 +179,7 @@ export async function bulkUpsertAvailability( } await sql` - insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date) + insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date) select unnest(${sql.array(unique.map((r) => r.model))}::text[]), unnest(${sql.array(unique.map((r) => r.isl))}::int[]), @@ -180,6 +189,7 @@ export async function bulkUpsertAvailability( unnest(${sql.array(unique.map((r) => r.framework))}::text[]), unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]), unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]), + unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]), ${date}::date on conflict do nothing `; diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 7d78e175..1aff5ea9 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // agentic scenario + 'scenario_type', + 'users', + 'offload_mode', + 'num_requests_total', + 'num_requests_successful', ]); +/** + * `benchmark_type` values understood by the ingest. + * - `single_turn` — fixed sequence-length runs (isl/osl set). + * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc). + */ +export type BenchmarkType = 'single_turn' | 'agentic_traces'; + /** * METRIC_KEYS from constants is the canonical set of known metric keys. 
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set(); export interface BenchmarkParams { config: ConfigParams; - isl: number; - osl: number; + benchmarkType: BenchmarkType; + // Null for agentic_traces; present for single_turn. + isl: number | null; + osl: number | null; conc: number; + /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */ + offloadMode: string; image: string | null; metrics: Record; } @@ -114,14 +131,45 @@ export function mapBenchmarkRow( return null; } - const isl = parseInt2(row.isl) ?? islOslFallback?.isl; - const osl = parseInt2(row.osl) ?? islOslFallback?.osl; - const conc = parseInt2(row.conc); - if (!isl || !osl || !conc) { + // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants), + // no isl/osl, and `users` instead of `conc`. Everything else stays as-is. + const isAgentic = String(row.scenario_type ?? '').startsWith('agentic'); + const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn'; + + const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); + const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); + // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones. + const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc); + if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } + // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from + // every runner, including ones with 0 successful requests and null metrics. + // Without this skip, the empty row's nulls overwrite a good row via + // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). 
+ if ( + typeof row.num_requests_successful === 'number' && + row.num_requests_successful === 0 && + typeof row.num_requests_total === 'number' && + row.num_requests_total > 0 + ) { + tracker.skips.failedRun++; + return null; + } + + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` + // ('none' → 'off'; any other non-empty value → 'on'). + const offloadModeRaw = + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : typeof row.offloading === 'string' && row.offloading.length > 0 + ? row.offloading === 'none' + ? 'off' + : 'on' + : 'off'; + const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); const precision = normalizePrecision(String(row.precision ?? '')); @@ -182,6 +230,12 @@ export function mapBenchmarkRow( } } + // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) + // — preserve as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic) { + (metrics as Record).offload_mode = offloadModeRaw; + } + // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? String(row.image).replaceAll('#', '/') : null; @@ -205,9 +259,11 @@ export function mapBenchmarkRow( numPrefillGpu, numDecodeGpu, }, + benchmarkType, isl, osl, conc, + offloadMode: offloadModeRaw, image, metrics, }; diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts new file mode 100644 index 00000000..de0009de --- /dev/null +++ b/packages/db/src/etl/compute-aggregate-stats.test.ts @@ -0,0 +1,123 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js'; + +/** Build a minimal `profile_export.jsonl` from a few synthetic requests. 
*/ +function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) { + const lines = requests.map((r, i) => + JSON.stringify({ + metadata: { + benchmark_phase: 'profiling', + conversation_id: `conv-${i}`, + turn_index: 0, + }, + metrics: { + input_sequence_length: { value: r.isl, unit: 'tokens' }, + output_sequence_length: { value: r.osl, unit: 'tokens' }, + request_latency: { value: r.rl ?? 1000, unit: 'ms' }, + time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' }, + }, + }), + ); + return gzipSync(Buffer.from(lines.join('\n'))); +} + +/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */ +function makeServerBlob() { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1, avg: 0.2 }, + { start_ns: 1, end_ns: 2, avg: 0.5 }, + { start_ns: 2, end_ns: 3, avg: 0.8 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }], + }, + 'vllm:prefix_cache_queries': { + series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }], + }, + }, + }); + return gzipSync(Buffer.from(json)); +} + +describe('computeAggregateStats', () => { + it('returns the current STATS_VERSION in the bundle', async () => { + const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null }); + expect(stats.version).toBe(STATS_VERSION); + }); + + it('leaves every metric null when both blobs are null', async () => { + const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null }); + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.kvCacheUtil).toBeNull(); + expect(stats.prefixCacheHitRate).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + }); + + it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => { + const profileBlob = makeProfileBlob([ + { isl: 100, osl: 
50, rl: 1000, ttft: 100 }, + { isl: 200, osl: 75, rl: 2000, ttft: 200 }, + { isl: 300, osl: 100, rl: 3000, ttft: 300 }, + ]); + const stats = await computeAggregateStats({ profileBlob, serverBlob: null }); + + expect(stats.isl?.n).toBe(3); + expect(stats.isl?.mean).toBeCloseTo(200, 6); + expect(stats.osl?.n).toBe(3); + expect(stats.osl?.mean).toBeCloseTo(75, 6); + + // Server-side metrics still null when there's no server blob. + expect(stats.kvCacheUtil).toBeNull(); + expect(stats.prefixCacheHitRate).toBeNull(); + + // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000. + expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6); + // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean. + // loads = [150, 275, 400], mean_load = 275 + // scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625] + // mean ≈ 1.9653 + expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3); + }); + + it('computes KV util + prefix hit rate from the server blob alone', async () => { + const stats = await computeAggregateStats({ + profileBlob: null, + serverBlob: makeServerBlob(), + }); + expect(stats.kvCacheUtil?.n).toBe(3); + expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6); + expect(stats.prefixCacheHitRate?.n).toBe(1); + expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6); + + // Profile-derived metrics absent. + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + }); + + it('tolerates a malformed profile blob by leaving its metrics null', async () => { + // A random non-gzip buffer triggers a gunzip error — code path swallows it. 
+ const garbage = Buffer.from('not-gzip-data'); + const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null }); + expect(stats.isl).toBeNull(); + expect(stats.osl).toBeNull(); + expect(stats.normalizedSessionTimeS).toBeNull(); + expect(stats.p90PrefillTpsPerUser).toBeNull(); + // Version still set so the row is considered "computed". + expect(stats.version).toBe(STATS_VERSION); + }); +}); diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts new file mode 100644 index 00000000..a422cfec --- /dev/null +++ b/packages/db/src/etl/compute-aggregate-stats.ts @@ -0,0 +1,147 @@ +/** + * Pre-compute the per-row aggregate stats for an `agentic_trace_replay` + * blob pair. The output lands in the `aggregate_stats` JSONB column so the + * detail page can serve the "Aggregates across configs" view and the + * derived chart x-axis modes from a single SQL row read, instead of + * parsing the raw blobs on demand. + * + * Shape is intentionally versioned — bump `STATS_VERSION` whenever the + * computation changes so the backfill script knows which rows to recompute. 
+ */ + +import { Readable } from 'node:stream'; +import { createGunzip, gunzipSync } from 'node:zlib'; + +import { chain } from 'stream-chain'; + +import { parser } from 'stream-json'; +import { pick } from 'stream-json/filters/pick.js'; +import { streamObject } from 'stream-json/streamers/stream-object.js'; + +import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics.js'; +import { + STATS_VERSION, + extractIslOsl, + extractServerMetricSamples, + percentilesOf, + type MetricPercentiles, +} from '../queries/agentic-aggregates.js'; + +export { STATS_VERSION }; + +export interface AggregateStats { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */ + normalizedSessionTimeS: number | null; + /** P90 of per-turn ISL/TTFT pooled across every session's turns. */ + p90PrefillTpsPerUser: number | null; +} + +/** Metric subtrees we extract via stream-parse on oversized server blobs. */ +const TARGET_METRIC_KEYS = new Set([ + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_hits', + 'vllm:gpu_prefix_cache_queries', +]); + +/** + * Stream-parse the gzipped server_metrics_json and collect just the metric + * subtrees we care about. Avoids Node's 512 MB max-string-length cap that + * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows. 
+ */ +async function streamExtractServer( + buffer: Buffer, +): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> { + /* eslint-disable @typescript-eslint/no-explicit-any */ + const collected: Record = {}; + const pipelineStream = chain([ + Readable.from(buffer), + createGunzip(), + parser(), + pick({ filter: 'metrics' }), + streamObject(), + ]); + await new Promise((resolve, reject) => { + (pipelineStream as any).on('data', (chunk: unknown) => { + const { key, value } = chunk as { key: string; value: unknown }; + if (TARGET_METRIC_KEYS.has(key)) collected[key] = value; + }); + (pipelineStream as any).on('end', resolve); + (pipelineStream as any).on('error', reject); + }); + /* eslint-enable @typescript-eslint/no-explicit-any */ + return extractServerMetricSamples(JSON.stringify({ metrics: collected })); +} + +/** + * Compute the full versioned stats bundle from a (profile, server-metrics) + * blob pair. Either blob may be null (e.g. only the server file existed) — + * the corresponding stats just come back null. 
+ */ +export async function computeAggregateStats(args: { + profileBlob: Buffer | null; + serverBlob: Buffer | null; +}): Promise { + let islPct: MetricPercentiles | null = null; + let oslPct: MetricPercentiles | null = null; + let normalized: number | null = null; + let prefillP90: number | null = null; + + if (args.profileBlob) { + try { + const jsonl = gunzipSync(args.profileBlob).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + islPct = percentilesOf(isl); + oslPct = percentilesOf(osl); + const derived = computeDerivedFromBlob(jsonl); + normalized = derived.normalized_session_time_s; + prefillP90 = derived.p90_prefill_tps_per_user; + } catch { + // ignore malformed blob — leave nulls + } + } + + let kvPct: MetricPercentiles | null = null; + let prefixPct: MetricPercentiles | null = null; + if (args.serverBlob) { + let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null; + try { + const json = gunzipSync(args.serverBlob).toString('utf8'); + server = extractServerMetricSamples(json); + } catch (error) { + const code = error && (error as NodeJS.ErrnoException).code; + const msg = error instanceof Error ? error.message : String(error); + // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to + // pull just the metric subtrees we need without materializing the + // full 500+ MB JSON string. 
+ if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) { + try { + server = await streamExtractServer(args.serverBlob); + } catch { + // stream fallback failed too — leave nulls + } + } + } + if (server) { + kvPct = percentilesOf(server.kvCacheUtil); + prefixPct = percentilesOf(server.prefixCacheHitRate); + } + } + + return { + version: STATS_VERSION, + isl: islPct, + osl: oslPct, + kvCacheUtil: kvPct, + prefixCacheHitRate: prefixPct, + normalizedSessionTimeS: normalized, + p90PrefillTpsPerUser: prefillP90, + }; +} diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts new file mode 100644 index 00000000..4c6f8791 --- /dev/null +++ b/packages/db/src/etl/compute-chart-series.test.ts @@ -0,0 +1,209 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js'; + +/** + * Build a minimal server_metrics_json blob covering the metrics the chart + * consumes. Each timeslice is one second long starting at t=0. + */ +function makeBlob(opts?: { + prefixHits?: number; + prefixQueries?: number; + promptTokensRate?: number; +}) { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1e9, avg: 0.1 }, + { start_ns: 1e9, end_ns: 2e9, avg: 0.4 }, + { start_ns: 2e9, end_ns: 3e9, avg: 0.7 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }], + }, + 'vllm:prefix_cache_queries': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 
100 }] }], + }, + 'vllm:num_requests_running': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }], + }, + 'vllm:num_requests_waiting': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }], + }, + 'vllm:prompt_tokens': { + series: [ + { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] }, + ], + }, + 'vllm:generation_tokens': { + series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }], + }, + 'vllm:prompt_tokens_by_source': { + series: [ + { + labels: { source: 'local_cache_hit' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }], + }, + { + labels: { source: 'miss' }, + timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }], + }, + ], + }, + }, + }); + return gzipSync(Buffer.from(json)); +} + +/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */ +function buildEngineSeries(engineId: number, baseRunning: number) { + const labels = { engine: String(engineId) }; + return { + runningSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: baseRunning }, + { start_ns: 1e9, avg: baseRunning + 1 }, + ], + }, + waitingSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: 0 }, + { start_ns: 1e9, avg: 0 }, + ], + }, + kvSlice: { + labels, + timeslices: [ + { start_ns: 0, avg: 0.25 }, + { start_ns: 1e9, avg: 0.5 }, + ], + }, + promptSlice: { + labels, + timeslices: [ + { start_ns: 0, rate: 100 }, + { start_ns: 1e9, rate: 200 }, + ], + }, + genSlice: { + labels, + timeslices: [ + { start_ns: 0, rate: 50 }, + { start_ns: 1e9, rate: 75 }, + ], + }, + }; +} + +describe('computeChartSeries', () => { + it('returns null when the blob is null', async () => { + expect(await computeChartSeries(null)).toBeNull(); + }); + + it('returns the current CHART_SERIES_VERSION in the bundle', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.version).toBe(CHART_SERIES_VERSION); + }); + + it('extracts kvCacheUsage points with t=seconds-from-start', 
async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.kvCacheUsage).toEqual([ + { t: 0, value: 0.1 }, + { t: 1, value: 0.4 }, + { t: 2, value: 0.7 }, + ]); + }); + + it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => { + const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 })); + expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]); + }); + + it('drops prefixCacheHitRate windows where queries.rate is 0', async () => { + const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 })); + expect(series?.prefixCacheHitRate).toEqual([]); + }); + + it('pairs running + waiting into queueDepth points', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]); + }); + + it('extracts prefillTps + decodeTps from counter rates', async () => { + const series = await computeChartSeries(makeBlob()); + expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]); + expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]); + }); + + it('splits promptTokensBySource by label and skips empty series', async () => { + const series = await computeChartSeries(makeBlob()); + expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([ + 'local_cache_hit', + 'miss', + ]); + expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]); + expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]); + }); + + it('computes timing metadata from the widest metric window', async () => { + const series = await computeChartSeries(makeBlob()); + // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9. 
+ expect(series?.startNs).toBe(0); + expect(series?.endNs).toBe(3e9); + expect(series?.durationS).toBeCloseTo(3, 6); + expect(series?.timeslicesCount).toBe(3); + }); + + it('returns null on a malformed (non-gzip) blob', async () => { + const result = await computeChartSeries(Buffer.from('not-gzip-data')); + expect(result).toBeNull(); + }); + + it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => { + // Simulate a 4-engine deployment: each engine reports its own series for + // every metric. Cluster-wide value should be SUM for running/waiting and + // counter rates, AVG for kv_cache_usage_perc (per-engine fraction). + const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine + const json = JSON.stringify({ + metrics: { + 'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) }, + 'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) }, + 'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) }, + 'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) }, + 'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) }, + }, + }); + const blob = gzipSync(Buffer.from(json)); + const cs = await computeChartSeries(blob); + expect(cs).not.toBeNull(); + // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1 + expect(cs!.queueDepth).toEqual([ + { t: 0, running: 12, waiting: 0, total: 12 }, + { t: 1, running: 16, waiting: 0, total: 16 }, + ]); + // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value) + expect(cs!.kvCacheUsage).toEqual([ + { t: 0, value: 0.25 }, + { t: 1, value: 0.5 }, + ]); + // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800 + expect(cs!.prefillTps).toEqual([ + { t: 0, value: 400 }, + { t: 1, value: 800 }, + ]); + expect(cs!.decodeTps).toEqual([ + { t: 0, value: 200 }, + { t: 1, value: 300 }, + ]); + }); +}); diff --git a/packages/db/src/etl/compute-chart-series.ts 
b/packages/db/src/etl/compute-chart-series.ts new file mode 100644 index 00000000..530600cf --- /dev/null +++ b/packages/db/src/etl/compute-chart-series.ts @@ -0,0 +1,290 @@ +/** + * Pre-compute the time-series for the agentic detail page chart, so the + * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every + * request. The output lands in `agentic_trace_replay.chart_series` and is + * read directly by `getTraceServerMetrics`. + * + * Versioned so the backfill script knows which rows are stale — bump + * `CHART_SERIES_VERSION` whenever the extraction algorithm changes. + */ + +import { Readable } from 'node:stream'; +import { createGunzip, gunzipSync } from 'node:zlib'; + +import { chain } from 'stream-chain'; + +import { parser } from 'stream-json'; +import { pick } from 'stream-json/filters/pick.js'; +import { streamObject } from 'stream-json/streamers/stream-object.js'; + +/** + * Bump when the extraction algorithm changes — backfill recomputes anything + * older. + * + * v2: aggregate vllm gauges/counters across all engine series (was reading + * only series[0], which under-counted by Nx on multi-engine DP/PP + * deployments — most visible as a request-queue-depth chart that maxed out + * at ~3 when the timeline clearly showed 20+ in-flight). + */ +export const CHART_SERIES_VERSION = 2; + +export interface TimeSeriesPoint { + /** Seconds from benchmark start. */ + t: number; + value: number; +} + +export interface QueueDepthPoint { + t: number; + running: number; + waiting: number; + total: number; +} + +export interface ChartSeries { + version: number; + /** ns wall-clock of the first window's start; for debugging only. */ + startNs: number; + /** ns wall-clock of the last window's end. */ + endNs: number; + /** Total benchmark window in seconds. */ + durationS: number; + /** Number of 1Hz windows captured. 
*/ + timeslicesCount: number; + kvCacheUsage: TimeSeriesPoint[]; + prefixCacheHitRate: TimeSeriesPoint[]; + queueDepth: QueueDepthPoint[]; + promptTokensBySource: Record; + prefillTps: TimeSeriesPoint[]; + decodeTps: TimeSeriesPoint[]; +} + +// ── Raw blob shapes (subset we read) ──────────────────────────────────── + +interface RawSlice { + start_ns?: number; + end_ns?: number; + avg?: number; + rate?: number; +} + +interface RawSeries { + labels?: Record; + timeslices?: RawSlice[]; +} + +interface RawMetric { + series?: RawSeries[]; +} + +type MetricsMap = Record; + +/** The set of metric subtrees the chart consumes. */ +const CHART_METRIC_KEYS = new Set([ + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:num_requests_running', + 'vllm:num_requests_waiting', + 'vllm:prompt_tokens', + 'vllm:generation_tokens', + 'vllm:prompt_tokens_by_source', +]); + +/** + * Stream-parse the gzipped server_metrics_json and collect only the metric + * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that + * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows. + */ +async function streamCollectMetrics(buffer: Buffer): Promise { + /* eslint-disable @typescript-eslint/no-explicit-any */ + const collected: MetricsMap = {}; + const pipeline = chain([ + Readable.from(buffer), + createGunzip(), + parser(), + pick({ filter: 'metrics' }), + streamObject(), + ]); + await new Promise((resolve, reject) => { + (pipeline as any).on('data', (chunk: unknown) => { + const { key, value } = chunk as { key: string; value: RawMetric }; + if (CHART_METRIC_KEYS.has(key)) collected[key] = value; + }); + (pipeline as any).on('end', resolve); + (pipeline as any).on('error', reject); + }); + /* eslint-enable @typescript-eslint/no-explicit-any */ + return collected; +} + +/** + * Parse the gzipped server_metrics blob into the metric map. 
Tries the + * synchronous fast path first; falls back to stream-parse on + * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. + */ +async function parseMetrics(buffer: Buffer): Promise { + try { + const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap }; + return obj.metrics ?? {}; + } catch (error) { + const code = error && (error as NodeJS.ErrnoException).code; + const msg = error instanceof Error ? error.message : String(error); + if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) { + return await streamCollectMetrics(buffer); + } + throw error; + } +} + +/** + * Build chart-ready time-series arrays from a gzipped server_metrics blob. + * The math mirrors `getTraceServerMetrics` — this helper exists so ingest, + * backfill, and the API path produce byte-identical results. + */ +export async function computeChartSeries(blob: Buffer | null): Promise { + if (!blob) return null; + let metrics: MetricsMap; + try { + metrics = await parseMetrics(blob); + } catch { + // Malformed blob → no series (caller treats null as "no data"). + return null; + } + return buildSeriesFromMetrics(metrics); +} + +/** + * Aggregate one timeslice field across all series of a metric, indexed by + * `start_ns`. Multi-engine vllm deployments report one series per engine — + * the cluster value is the sum (for running/waiting/throughput counters) + * or the average (for kv_cache_usage_perc, a per-engine fraction). + */ +function aggregateByStart( + series: readonly RawSeries[] | undefined, + field: 'avg' | 'rate', + combine: 'sum' | 'avg', +): Map { + const sums = new Map(); + const counts = new Map(); + for (const s of series ?? []) { + for (const ts of s.timeslices ?? []) { + if (typeof ts.start_ns !== 'number') continue; + const v = ts[field]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v); + counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 
0) + 1); + } + } + if (combine === 'sum') return sums; + const out = new Map(); + for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1)); + return out; +} + +/** Stable order: emit one point per unique start_ns, chronologically. */ +function sortedEntries(m: Map): [number, number][] { + return [...m.entries()].toSorted((a, b) => a[0] - b[0]); +} + +function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { + // Timing reference: smallest start_ns and largest end_ns across every + // timeslice we extracted. timeslicesCount is the length of any single + // series (engines are scraped on the same cadence), so picking the max + // length across all series of all metrics is safe. + let startNs = Number.POSITIVE_INFINITY; + let endNs = 0; + let timeslicesCount = 0; + for (const metricMeta of Object.values(metrics)) { + for (const s of metricMeta?.series ?? []) { + const ts = s.timeslices ?? []; + if (ts.length === 0) continue; + timeslicesCount = Math.max(timeslicesCount, ts.length); + const first = ts[0]!; + const last = ts.at(-1)!; + if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns; + if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns; + } + } + if (!Number.isFinite(startNs)) startNs = 0; + const tOf = (ns: number) => (ns - startNs) / 1e9; + + // KV cache usage (gauge, 0..1) — average across engines so the value + // stays a fraction (each engine has its own KV pool). + const kvSeries = + metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvCacheUsage: TimeSeriesPoint[] = sortedEntries( + aggregateByStart(kvSeries, 'avg', 'avg'), + ).map(([t, v]) => ({ t: tOf(t), value: v })); + + // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across + // engines, joined on start_ns. 
+ const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum'); + const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum'); + const prefixCacheHitRate: TimeSeriesPoint[] = []; + for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) { + const q = qsByT.get(t); + if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q }); + } + + // Queue depth: sum running + waiting across engines per timeslice. + const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum'); + const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum'); + const queueDepth: QueueDepthPoint[] = []; + // Union of timestamps so we surface activity even if one of the gauges + // didn't report a sample on a given tick. + const allTimes = new Set([...runByT.keys(), ...waitByT.keys()]); + for (const t of [...allTimes].toSorted((a, b) => a - b)) { + const running = runByT.get(t) ?? 0; + const waiting = waitByT.get(t) ?? 0; + queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting }); + } + + // Throughput: sum the counter `rate` (already per-second) across engines. + const counterRate = (name: string): TimeSeriesPoint[] => + sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({ + t: tOf(t), + value: v, + })); + const prefillTps = counterRate('vllm:prompt_tokens'); + const decodeTps = counterRate('vllm:generation_tokens'); + + // Per-source prompt tokens — sum across engines per source label. + const promptBySrcByT = new Map>(); + for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) { + const labels = series.labels ?? {}; + const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? 
JSON.stringify(labels); + let byT = promptBySrcByT.get(source); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(source, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + const promptTokensBySource: Record = {}; + for (const [source, byT] of promptBySrcByT) { + const arr: TimeSeriesPoint[] = []; + for (const [t, v] of [...byT.entries()].toSorted((a, b) => a[0] - b[0])) { + if (v > 0) arr.push({ t: tOf(t), value: v }); + } + if (arr.length > 0) promptTokensBySource[source] = arr; + } + return { + version: CHART_SERIES_VERSION, + startNs, + endNs, + durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0, + timeslicesCount, + kvCacheUsage, + prefixCacheHitRate, + queueDepth, + promptTokensBySource, + prefillTps, + decodeTps, + }; +} diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts new file mode 100644 index 00000000..64512aca --- /dev/null +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -0,0 +1,153 @@ +import { gzipSync } from 'node:zlib'; + +import { describe, expect, it } from 'vitest'; + +import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js'; + +interface SyntheticRequest { + cid: string; + ti: number; + wid?: string; + ad?: number; + phase?: string; + credit: number; + start: number; + end: number; + ack?: number | null; + ttftMs?: number | null; + isl?: number | null; + osl?: number | null; + cancelled?: boolean; +} + +function makeBlob(requests: SyntheticRequest[]) { + const lines = requests.map((r) => + JSON.stringify({ + metadata: { + conversation_id: r.cid, + turn_index: r.ti, + worker_id: r.wid ?? 'worker_default', + agent_depth: r.ad ?? 0, + benchmark_phase: r.phase ?? 'profiling', + credit_issued_ns: r.credit, + request_start_ns: r.start, + ...(r.ack === undefined ? 
{} : { request_ack_ns: r.ack }), + request_end_ns: r.end, + was_cancelled: r.cancelled ?? false, + }, + metrics: { + time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' }, + input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' }, + output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' }, + }, + }), + ); + return gzipSync(Buffer.from(lines.join('\n'))); +} + +describe('computeRequestTimeline', () => { + it('returns null when the blob is null', () => { + expect(computeRequestTimeline(null)).toBeNull(); + }); + + it('returns null on a malformed (non-gzip) blob', () => { + expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull(); + }); + + it('returns null when the blob has no parseable records', () => { + expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull(); + }); + + it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => { + const tl = computeRequestTimeline( + makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]), + ); + expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION); + }); + + it('shifts ns timestamps to be relative to the earliest credit_issued', () => { + // Two requests with absolute ns starting at 1_000_000_000. 
+ const tl = computeRequestTimeline( + makeBlob([ + { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 }, + { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 }, + ]), + ); + expect(tl?.startNs).toBe(1_000_000_000); + expect(tl?.endNs).toBe(1_030_000_000); + expect(tl?.durationS).toBeCloseTo(0.03, 6); + expect(tl?.requests[0]?.credit).toBe(0); + expect(tl?.requests[0]?.end).toBe(10_000_000); + expect(tl?.requests[1]?.start).toBe(21_000_000); + }); + + it('sorts requests by start time, regardless of input order', () => { + const tl = computeRequestTimeline( + makeBlob([ + { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 }, + { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 }, + { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 }, + ]), + ); + expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]); + }); + + it('preserves conversation/worker grouping fields', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'conv-A', + ti: 5, + wid: 'worker_abcd1234', + ad: 2, + phase: 'profiling', + credit: 0, + start: 10, + end: 100, + }, + ]), + ); + const r = tl?.requests[0]!; + expect(r.cid).toBe('conv-A'); + expect(r.ti).toBe(5); + expect(r.wid).toBe('worker_abcd1234'); + expect(r.ad).toBe(2); + expect(r.phase).toBe('profiling'); + }); + + it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'a', + ti: 0, + credit: 0, + start: 10, + end: 100, + ttftMs: 25.5, + isl: 1024, + osl: 256, + cancelled: true, + }, + ]), + ); + const r = tl?.requests[0]!; + expect(r.cancelled).toBe(true); + expect(r.ttftMs).toBeCloseTo(25.5, 6); + expect(r.isl).toBe(1024); + expect(r.osl).toBe(256); + }); + + it('skips records missing both credit_issued_ns and request_start_ns', () => { + // Build a record with only request_end_ns — the helper rejects it. 
+ const broken = gzipSync( + Buffer.from( + JSON.stringify({ + metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 }, + metrics: {}, + }), + ), + ); + expect(computeRequestTimeline(broken)).toBeNull(); + }); +}); diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts new file mode 100644 index 00000000..a1134f7a --- /dev/null +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -0,0 +1,182 @@ +/** + * Pre-compute the per-request timeline for the agentic detail page's + * Gantt view. Output lands in `agentic_trace_replay.request_timeline` + * and is read directly by the timeline API route. + * + * Shape is a thin array — ~150 bytes per request × ~200 requests per + * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw + * gzipped JSONL blob (~1-3 MB). + * + * Versioned so the backfill script knows which rows are stale — bump + * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes. + */ + +import { gunzipSync } from 'node:zlib'; + +/** Bump when the extraction algorithm changes — backfill recomputes anything older. */ +export const REQUEST_TIMELINE_VERSION = 1; + +export interface RequestRecord { + /** Conversation id (groups turns of one agent session). */ + cid: string; + /** Zero-based turn index within the conversation. */ + ti: number; + /** Worker id (concurrency slot that handled this request). */ + wid: string; + /** Sub-agent depth (0 = top-level). */ + ad: number; + /** `warmup` or `profiling`. */ + phase: string; + /** ns offset from timeline.startNs. Load gen decided to dispatch. */ + credit: number; + /** ns offset from timeline.startNs. HTTP send started. */ + start: number; + /** ns offset from timeline.startNs. First server acknowledgement (or null). */ + ack: number | null; + /** ns offset from timeline.startNs. Last byte received. */ + end: number; + /** Time-to-first-token in ms. 
*/ + ttftMs: number | null; + /** Input sequence length (tokens). */ + isl: number | null; + /** Output sequence length (tokens). */ + osl: number | null; + cancelled: boolean; +} + +export interface RequestTimeline { + version: number; + /** Wall-clock ns of the earliest event (used as the relative-time origin). */ + startNs: number; + /** Wall-clock ns of the latest `request_end_ns`. */ + endNs: number; + /** Total span in seconds. */ + durationS: number; + requests: RequestRecord[]; +} + +interface RawMetadata { + conversation_id?: string; + turn_index?: number; + worker_id?: string; + agent_depth?: number; + benchmark_phase?: string; + credit_issued_ns?: number; + request_start_ns?: number; + request_ack_ns?: number; + request_end_ns?: number; + was_cancelled?: boolean; +} + +interface RawMetricValue { + value?: number; +} + +interface RawRecord { + metadata?: RawMetadata; + metrics?: { + time_to_first_token?: RawMetricValue | number; + input_sequence_length?: RawMetricValue | number; + output_sequence_length?: RawMetricValue | number; + }; +} + +/** Pull a numeric metric out of the `{value, unit}` envelope (or a bare number). */ +function readNum(v: unknown): number | undefined { + if (typeof v === 'number') return Number.isFinite(v) ? v : undefined; + if (v && typeof v === 'object' && 'value' in v) { + const inner = (v as { value?: unknown }).value; + if (typeof inner === 'number' && Number.isFinite(inner)) return inner; + } + return undefined; +} + +/** + * Parse the gzipped `profile_export.jsonl` blob into a chart-ready + * timeline. Returns null on a missing or malformed blob. + */ +export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null { + if (!blob) return null; + let text: string; + try { + text = gunzipSync(blob).toString('utf8'); + } catch { + return null; + } + + // First pass: parse + collect raw turns; find timeline origin. 
+ const raw: { + meta: RawMetadata; + ttftMs: number | null; + isl: number | null; + osl: number | null; + }[] = []; + let originNs = Number.POSITIVE_INFINITY; + let endNs = 0; + + for (const line of text.split('\n')) { + if (!line) continue; + let rec: RawRecord; + try { + rec = JSON.parse(line) as RawRecord; + } catch { + continue; + } + const meta = rec.metadata ?? {}; + // Use credit_issued_ns when available (the true start of the request's + // lifecycle), falling back to request_start_ns. Skip rows missing both. + const cStart = meta.credit_issued_ns ?? meta.request_start_ns; + const cEnd = meta.request_end_ns; + if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue; + + if (cStart < originNs) originNs = cStart; + if (cEnd > endNs) endNs = cEnd; + + raw.push({ + meta, + ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null, + isl: readNum(rec.metrics?.input_sequence_length) ?? null, + osl: readNum(rec.metrics?.output_sequence_length) ?? null, + }); + } + + if (raw.length === 0) return null; + if (!Number.isFinite(originNs)) originNs = 0; + + // Second pass: shift timestamps to be relative to originNs (smaller + // numbers fit in JSON nicely and the frontend doesn't need bigint math). + const requests: RequestRecord[] = []; + for (const r of raw) { + const m = r.meta; + const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs; + const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs; + const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null; + const end = (m.request_end_ns ?? originNs) - originNs; + requests.push({ + cid: m.conversation_id ?? 'unknown', + ti: typeof m.turn_index === 'number' ? m.turn_index : 0, + wid: m.worker_id ?? 'unknown', + ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0, + phase: m.benchmark_phase ?? 
'unknown', + credit, + start, + ack, + end, + ttftMs: r.ttftMs, + isl: r.isl, + osl: r.osl, + cancelled: m.was_cancelled === true, + }); + } + + // Stable order so backfill output is deterministic. + requests.sort((a, b) => a.start - b.start); + + return { + version: REQUEST_TIMELINE_VERSION, + startNs: originNs, + endNs, + durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0, + requests, + }; +} diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index d42429c9..0e1166aa 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -22,6 +22,8 @@ export { GPU_KEYS }; * stripped base is not in `GPU_KEYS`. */ export function hwToGpuKey(hw: string): string | null { + // Take the first segment before `-` as the canonical key. Subsumes all the + // prior explicit suffix strips (-nv, -amds, -dgxc-slurm, -p1, -cw, …). const base = hw.toLowerCase().split('-')[0]; return GPU_KEYS.has(base) ? base : null; } diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts index 90ad73b7..e407db3a 100644 --- a/packages/db/src/etl/skip-tracker.test.ts +++ b/packages/db/src/etl/skip-tracker.test.ts @@ -9,6 +9,7 @@ describe('createSkipTracker', () => { expect(tracker.skips.unmappedHw).toBe(0); expect(tracker.skips.noIslOsl).toBe(0); expect(tracker.skips.dbError).toBe(0); + expect(tracker.skips.traceReplayMissing).toBe(0); }); it('initializes with empty unmapped sets', () => { diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts index 6166ea44..401d197c 100644 --- a/packages/db/src/etl/skip-tracker.ts +++ b/packages/db/src/etl/skip-tracker.ts @@ -8,7 +8,10 @@ export interface Skips { unmappedModel: number; unmappedHw: number; noIslOsl: number; + failedRun: number; dbError: number; + /** Agentic point whose sibling `agentic_` artifact had no trace_replay files. 
*/ + traceReplayMissing: number; } export interface SkipSnapshot { @@ -66,7 +69,15 @@ const MAX_DB_ERRORS = 10; * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets. */ export function createSkipTracker(): SkipTracker { - const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 }; + const skips: Skips = { + badZip: 0, + unmappedModel: 0, + unmappedHw: 0, + noIslOsl: 0, + failedRun: 0, + dbError: 0, + traceReplayMissing: 0, + }; const unmappedModels = new Set(); const unmappedHws = new Set(); const unmappedPrecisions = new Set(); diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts new file mode 100644 index 00000000..8cc03f2a --- /dev/null +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -0,0 +1,103 @@ +/** + * Insert per-point aiperf trace files (`profile_export.jsonl` + + * `server_metrics_export.csv`) into `agentic_trace_replay` and link the new row + * to each provided benchmark_results row via `trace_replay_id`. + * + * Mirrors the {@link insertServerLog} idempotency contract: rows that already + * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't + * duplicate the sibling blob. + */ + +import { gzipSync } from 'node:zlib'; + +import type postgres from 'postgres'; + +import { computeAggregateStats } from './compute-aggregate-stats.js'; +import { computeChartSeries } from './compute-chart-series.js'; +import { computeRequestTimeline } from './compute-request-timeline.js'; + +type Sql = ReturnType; + +/** + * Persist the per-point trace files and link them to `benchmarkResultIds`. + * + * @param sql Active `postgres` connection. + * @param benchmarkResultIds DB ids of the benchmark_results rows produced by + * the same `bmk_agentic_` artifact whose + * sibling `agentic_` directory holds these + * trace files. + * @param profileExportJsonl Raw bytes of `profile_export.jsonl`, or null. + * Gzipped before storage. 
+ * @param serverMetricsCsv Raw bytes of `server_metrics_export.csv`, or null. + * Stored as-is. + * @param serverMetricsJson Raw bytes of `server_metrics_export.json` — + * per-scrape time-series of every Prometheus metric. + * Optional, gzipped before storage (~42x ratio). + */ +export async function insertTraceReplay( + sql: Sql, + benchmarkResultIds: number[], + profileExportJsonl: Buffer | null, + serverMetricsCsv: Buffer | null, + serverMetricsJson: Buffer | null = null, +): Promise { + if (benchmarkResultIds.length === 0) return; + if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return; + + // Only link rows that don't already point at a trace_replay row — keeps + // re-ingest from inserting duplicate sibling blobs. + const unlinked = await sql<{ id: number }[]>` + select id from benchmark_results + where id = any(${sql.array(benchmarkResultIds)}::bigint[]) + and trace_replay_id is null + `; + if (unlinked.length === 0) return; + + const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null; + const profileSize = profileExportJsonl ? profileExportJsonl.length : null; + const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null; + const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null; + const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null; + + // Pre-compute aggregate stats + chart-ready time-series + per-request + // timeline so the detail page doesn't have to re-parse these blobs on + // every request. Each helper tolerates a null blob and falls back to + // a streaming parser for oversized server_metrics blobs. 
+ const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([ + computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }), + computeChartSeries(metricsJsonGz), + Promise.resolve(computeRequestTimeline(profileGz)), + ]); + + const [{ id: traceReplayId }] = await sql<{ id: number }[]>` + insert into agentic_trace_replay ( + profile_export_jsonl_gz, + profile_export_uncompressed_size, + server_metrics_csv, + server_metrics_csv_size, + server_metrics_json_gz, + server_metrics_json_uncompressed_size, + aggregate_stats, + chart_series, + request_timeline + ) + values ( + ${profileGz}, + ${profileSize}, + ${serverMetricsCsv}, + ${csvSize}, + ${metricsJsonGz}, + ${metricsJsonSize}, + ${sql.json(structuredClone(aggregateStats) as unknown as Parameters[0])}, + ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters[0])}, + ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters[0])} + ) + returning id + `; + + await sql` + update benchmark_results + set trace_replay_id = ${traceReplayId} + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + `; +} diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index c345e662..eeb55313 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -45,6 +45,7 @@ import { bulkUpsertAvailability, insertServerLog, } from './etl/benchmark-ingest'; +import { insertTraceReplay } from './etl/trace-replay-ingest'; import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper'; import { ingestEvalRow } from './etl/eval-ingest'; import { mapEvalSamples } from './etl/eval-samples-mapper'; @@ -109,15 +110,30 @@ if (isDownloadMode) { } catch {} } - const byName = new Map(); + // Strip the trailing `__` token from each + // artifact name, then group by the resulting logical name and keep only + // the most recent per group. 
Without this, two artifacts produced on + // different runners for the same logical config (e.g. `…_h200-cw_00` and + // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty + // metrics can overwrite the good one via ON CONFLICT DO UPDATE. + // + // The runner pool name itself has no underscores (`h200-cw`, + // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip + // bounded — using `\w` here would over-match across earlier `_` + // separators and collapse different (conc, offload) variants into the + // same logical name. + const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/; + const byLogical = new Map(); for (const a of allArtifacts) { - const existing = byName.get(a.name); + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); if (!existing || a.created_at > existing.created_at) { - byName.set(a.name, a); + byLogical.set(key, a); } } - for (const [name, artifact] of byName) { + for (const [, artifact] of byLogical) { + const name = artifact.name; console.log(` ${name}`); const zipPath = path.join(artifactsDir, 'artifact.zip'); execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { @@ -129,7 +145,7 @@ if (isDownloadMode) { fs.unlinkSync(zipPath); } - console.log(`\n Downloaded ${byName.size} artifact(s)`); + console.log(`\n Downloaded ${byLogical.size} artifact(s)`); // Fetch run attempt from API const attemptStr = execSync( @@ -194,6 +210,14 @@ const ARTIFACT_NAMES = { changelog: 'changelog-metadata', } as const; +/** + * Strip the `bmk_` and/or `agentic_` prefixes from an artifact directory name + * so the bare suffix becomes a shared key between `bmk_agentic_` and + * its sibling `agentic_` artifact. 
+ */ +const stripBmkAndAgenticPrefix = (s: string): string => + s.replace(/^bmk_/u, '').replace(/^agentic_/u, ''); + function readJson(filePath: string): unknown { try { return JSON.parse(fs.readFileSync(filePath, 'utf8')); @@ -294,13 +318,14 @@ async function main(): Promise { const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; let totalNewBmk = 0, @@ -311,6 +336,7 @@ async function main(): Promise { let totalSamples = 0; let totalSampleFiles = 0; let totalChangelogs = 0; + let totalTraceReplayLinked = 0; // ── Check for evals-only flag in changelog ──────────────────────────── const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog); @@ -365,6 +391,56 @@ async function main(): Promise { console.log(` Found ${serverLogPaths.size} server log artifact(s)`); } + // Sibling aiperf artifacts: each `bmk_agentic_` is paired with an + // `agentic_` dir holding `profile_export.jsonl` and + // `server_metrics_export.csv`. The harness emits these under either a + // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current). + // Older non-aiperf agentic runs don't ship this sibling. Key on the bare + // suffix so both names map to the same Map entry. 
+ const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay']; + const traceReplayPaths = new Map< + string, + { + profileJsonl: string | null; + serverMetricsCsv: string | null; + serverMetricsJson: string | null; + } + >(); + if (fs.existsSync(artifactsDir)) { + for (const d of fs.readdirSync(artifactsDir)) { + if (!d.startsWith('agentic_')) continue; + let profile: string | null = null; + let metrics: string | null = null; + let metricsJson: string | null = null; + for (const sub of TRACE_SUBDIRS) { + const dir = path.join(artifactsDir, d, sub); + if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue; + if (!profile) { + const p = path.join(dir, 'profile_export.jsonl'); + if (fs.existsSync(p)) profile = p; + } + if (!metrics) { + const m = path.join(dir, 'server_metrics_export.csv'); + if (fs.existsSync(m)) metrics = m; + } + if (!metricsJson) { + const j = path.join(dir, 'server_metrics_export.json'); + if (fs.existsSync(j)) metricsJson = j; + } + } + if (!profile && !metrics && !metricsJson) continue; + const suffix = stripBmkAndAgenticPrefix(d); + traceReplayPaths.set(suffix, { + profileJsonl: profile, + serverMetricsCsv: metrics, + serverMetricsJson: metricsJson, + }); + } + } + if (traceReplayPaths.size > 0) { + console.log(` Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`); + } + const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))]; console.log(` Found ${allBmkFiles.length} benchmark JSON file(s)`); @@ -415,6 +491,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } @@ -431,12 +508,42 @@ async function main(): Promise { } } } + + // Trace-replay sibling lookup for agentic points only. The aiperf + // harness emits `agentic_/trace_replay/...` next to the + // `bmk_agentic_` artifact we just ingested. 
+ if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) { + const suffix = stripBmkAndAgenticPrefix(parentDir); + const trace = traceReplayPaths.get(suffix); + if (trace) { + try { + const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null; + const metrics = trace.serverMetricsCsv + ? fs.readFileSync(trace.serverMetricsCsv) + : null; + const metricsJson = trace.serverMetricsJson + ? fs.readFileSync(trace.serverMetricsJson) + : null; + await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson); + totalTraceReplayLinked += insertedIds.length; + } catch (error: any) { + tracker.recordDbError(`trace_replay for ${suffix}`, error); + } + } else { + tracker.skips.traceReplayMissing++; + } + } } catch (error: any) { tracker.recordDbError(path.basename(file), error); } } } console.log(` Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`); + if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) { + console.log( + ` Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`, + ); + } if (availRows.length > 0) { try { @@ -654,11 +761,17 @@ async function main(): Promise { const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker; const totalSkips = - skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError; + skips.badZip + + skips.unmappedModel + + skips.unmappedHw + + skips.noIslOsl + + skips.failedRun + + skips.dbError; if (totalSkips > 0) { console.log(`\n Skipped: ${totalSkips} rows`); const skipLines: [string, number][] = [ ['no isl/osl (old format)', skips.noIslOsl], + ['failed run (0 successful)', skips.failedRun], ['unmapped model', skips.unmappedModel], ['unmapped hw', skips.unmappedHw], ['bad/empty zip', skips.badZip], diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index 9c17bfaf..b4a6fb95 100644 --- 
a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -457,6 +457,9 @@ async function mapWorkflowDir( unmappedModel: local.skips.unmappedModel, unmappedHw: local.skips.unmappedHw, noIslOsl: local.skips.noIslOsl, + failedRun: local.skips.failedRun, + // GCS backup doesn't ingest aiperf trace files; counter stays 0. + traceReplayMissing: local.skips.traceReplayMissing, }, localUnmappedModels: new Set(local.unmappedModels), localUnmappedHws: new Set(local.unmappedHws), @@ -621,13 +624,14 @@ async function main(): Promise { // Upsert availability rows only for successfully resolved configs const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const r of allInserted) { availRows.push({ @@ -639,6 +643,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts index a3b62fe0..f868767e 100644 --- a/packages/db/src/ingest-supplemental.ts +++ b/packages/db/src/ingest-supplemental.ts @@ -219,8 +219,10 @@ async function ingestSupplementalBmk( const rows: { configId: number; - isl: number; - osl: number; + benchmarkType: 'single_turn' | 'agentic_traces'; + offloadMode: string; + isl: number | null; + osl: number | null; conc: number; image: string | null; metrics: Record; @@ -271,6 +273,8 @@ async function ingestSupplementalBmk( rows.push({ configId, + benchmarkType: 'single_turn', + offloadMode: 'off', isl: entry.isl, osl: entry.osl, conc: entry.conc, @@ -294,13 +298,14 @@ async function ingestSupplementalBmk( // to `rows` are exactly the valid ones. 
const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const entry of entries) { const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined }); @@ -317,6 +322,7 @@ async function ingestSupplementalBmk( framework, specMethod, disagg, + benchmarkType: 'single_turn', }); } if (availRows.length > 0) { diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 25525e04..785d82c4 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -273,6 +273,7 @@ function toBenchmarkRow( metrics?: Record, ): BenchmarkRow { return { + id: br.id, hardware: c.hardware, framework: c.framework, model: c.model, @@ -290,6 +291,8 @@ function toBenchmarkRow( decode_num_workers: c.decode_num_workers, num_prefill_gpu: c.num_prefill_gpu, num_decode_gpu: c.num_decode_gpu, + benchmark_type: br.benchmark_type ?? 'single_turn', + offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off', isl: br.isl, osl: br.osl, conc: br.conc, @@ -410,7 +413,11 @@ export function getAvailabilityData(): AvailabilityRow[] { for (const a of s.availability) { const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`; if (validKeys.has(key)) { - rows.push({ ...a, date: toDateString(a.date) }); + rows.push({ + ...a, + benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 
'single_turn', + date: toDateString(a.date), + }); } } diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts new file mode 100644 index 00000000..8c712323 --- /dev/null +++ b/packages/db/src/queries/agentic-aggregates.test.ts @@ -0,0 +1,113 @@ +import { describe, expect, it } from 'vitest'; + +import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates.js'; + +describe('percentilesOf', () => { + it('returns null for empty input', () => { + expect(percentilesOf([])).toBeNull(); + expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull(); + }); + + it('computes percentiles for a simple integer range', () => { + // 1..100, evenly spaced — linear quantile is straightforward. + const xs = Array.from({ length: 100 }, (_, i) => i + 1); + const p = percentilesOf(xs); + expect(p).not.toBeNull(); + expect(p!.n).toBe(100); + expect(p!.mean).toBeCloseTo(50.5, 6); + expect(p!.p50).toBeCloseTo(50.5, 6); + // For 100 sorted values, p75 = sorted[0.75 * 99] = sorted[74.25] interp. 
+ expect(p!.p75).toBeCloseTo(75.25, 6); + expect(p!.p90).toBeCloseTo(90.1, 6); + expect(p!.p99).toBeCloseTo(99.01, 6); + }); + + it('filters out non-finite values before computing', () => { + const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]); + expect(p?.n).toBe(4); + expect(p?.mean).toBeCloseTo(2.5, 6); + }); +}); + +describe('extractIslOsl', () => { + it('reads input/output sequence length from profiling records', () => { + const lines = [ + JSON.stringify({ + metadata: { benchmark_phase: 'profiling' }, + metrics: { + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + JSON.stringify({ + metadata: { benchmark_phase: 'profiling' }, + metrics: { + input_sequence_length: { value: 200, unit: 'tokens' }, + output_sequence_length: { value: 75, unit: 'tokens' }, + }, + }), + // warmup record — should be ignored + JSON.stringify({ + metadata: { benchmark_phase: 'warmup' }, + metrics: { + input_sequence_length: { value: 9999, unit: 'tokens' }, + output_sequence_length: { value: 9999, unit: 'tokens' }, + }, + }), + ]; + const { isl, osl } = extractIslOsl(lines.join('\n')); + expect(isl).toEqual([100, 200]); + expect(osl).toEqual([50, 75]); + }); +}); + +describe('extractServerMetricSamples', () => { + it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => { + const json = JSON.stringify({ + metrics: { + 'vllm:kv_cache_usage_perc': { + series: [ + { + timeslices: [ + { start_ns: 0, end_ns: 1, avg: 0.1 }, + { start_ns: 1, end_ns: 2, avg: 0.5 }, + { start_ns: 2, end_ns: 3, avg: 0.9 }, + ], + }, + ], + }, + 'vllm:prefix_cache_hits': { + series: [ + { + timeslices: [ + { start_ns: 0, rate: 80 }, + { start_ns: 1, rate: 50 }, + { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0 + ], + }, + ], + }, + 'vllm:prefix_cache_queries': { + series: [ + { + timeslices: [ + { start_ns: 0, rate: 100 }, // hit rate = 0.8 + { start_ns: 1, rate: 100 
}, // hit rate = 0.5 + { start_ns: 2, rate: 0 }, + ], + }, + ], + }, + }, + }); + const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json); + expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]); + expect(prefixCacheHitRate).toEqual([0.8, 0.5]); + }); + + it('returns empty arrays when the JSON lacks the expected metric series', () => { + const out = extractServerMetricSamples(JSON.stringify({ metrics: {} })); + expect(out.kvCacheUtil).toEqual([]); + expect(out.prefixCacheHitRate).toEqual([]); + }); +}); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts new file mode 100644 index 00000000..1ad7fd7f --- /dev/null +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -0,0 +1,421 @@ +/** + * Per-id aggregate stats for the "Aggregates across configs" view on the + * agentic detail page. Each id contributes one summary number per metric per + * percentile so the frontend can plot how each metric varies across the + * SKU's parallelism + concurrency configs. + * + * Sources: + * - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase) + * - `server_metrics_json` → time-series of KV cache utilization + + * prefix-cache hit rate per scrape interval + * + * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing + * or has no usable samples — frontend treats those as "no data". + */ + +import { Readable } from 'node:stream'; +import { createGunzip, gunzipSync } from 'node:zlib'; + +import { chain } from 'stream-chain'; + +import { parser } from 'stream-json'; +import { pick } from 'stream-json/filters/pick.js'; +import { streamObject } from 'stream-json/streamers/stream-object.js'; + +import type { DbClient } from '../connection.js'; + +/** + * Bump when the aggregate-stats computation algorithm changes — the backfill + * script recomputes any row whose stored `aggregate_stats.version` is older. 
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular + * import: the compute helper depends on the percentile utilities below. + * + * v2: aggregate vllm gauges/counters across all engine series (was reading + * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + */ +export const STATS_VERSION = 2; + +export interface MetricPercentiles { + mean: number; + p50: number; + p75: number; + p90: number; + p99: number; + /** Sample count used to compute the percentiles. */ + n: number; +} + +export interface AgenticAggregate { + id: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; +} + +export type AgenticAggregateMap = Record; + +/** + * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per + * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed + * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire + * size, so two of those = ~50 MB and three already trips the 64 MB cap). + * We fetch the two blob types in separate queries with different chunk + * sizes. + */ +const PROFILE_CHUNK_SIZE = 8; +const SERVER_CHUNK_SIZE = 1; + +/** Linear-interpolated percentile (matches numpy default). */ +function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 0) return Number.NaN; + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +function meanOf(xs: number[]): number { + let s = 0; + for (const x of xs) s += x; + return s / xs.length; +} + +/** Compute the percentile bundle for an array of samples; null if empty. 
*/ +export function percentilesOf(samples: number[]): MetricPercentiles | null { + const clean = samples.filter((v) => Number.isFinite(v)); + if (clean.length === 0) return null; + const sorted = [...clean].toSorted((a, b) => a - b); + return { + mean: meanOf(sorted), + p50: quantile(sorted, 0.5), + p75: quantile(sorted, 0.75), + p90: quantile(sorted, 0.9), + p99: quantile(sorted, 0.99), + n: sorted.length, + }; +} + +/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */ +function readNum(v: unknown): number | undefined { + if (typeof v === 'number') return v; + if (v && typeof v === 'object' && 'value' in v) { + const inner = (v as { value?: unknown }).value; + if (typeof inner === 'number' && Number.isFinite(inner)) return inner; + } + return undefined; +} + +interface ProfileRecord { + metadata?: { benchmark_phase?: string }; + metrics?: { + input_sequence_length?: { value?: number } | number; + output_sequence_length?: { value?: number } | number; + }; +} + +/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */ +export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } { + const isl: number[] = []; + const osl: number[] = []; + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: ProfileRecord; + try { + rec = JSON.parse(line) as ProfileRecord; + } catch { + continue; + } + if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; + const m = rec.metrics ?? 
{}; + const i = readNum(m.input_sequence_length); + const o = readNum(m.output_sequence_length); + if (typeof i === 'number') isl.push(i); + if (typeof o === 'number') osl.push(o); + } + return { isl, osl }; +} + +interface TimeSlice { + start_ns?: number; + end_ns?: number; + avg?: number; + rate?: number; + count?: number; + sum?: number; +} +interface Series { + labels?: Record; + timeslices?: TimeSlice[]; +} +interface MetricMeta { + series?: Series[]; +} +interface MetricsJson { + metrics?: Record; +} + +/** + * Aggregate a per-timeslice field across all series of a metric, indexed by + * the timeslice's `start_ns`. vllm reports one series per engine on + * multi-engine DP/PP deployments, so we sum (or average) across engines to + * get the cluster-wide value at each timeslice. + * + * `field` selects which numeric field on a timeslice to read (`avg` for + * gauges, `rate` for counter deltas). `combine` controls cross-engine math: + * 'sum' for running/waiting/throughput counters where the cluster total is + * the sum; 'avg' for KV cache utilization, which is bounded [0, 1] per + * engine and should be averaged across engines for the cluster view. + */ +function aggregateSeriesByStart( + metricSeries: readonly Series[] | undefined, + field: 'avg' | 'rate', + combine: 'sum' | 'avg', +): Map { + const sums = new Map(); + const counts = new Map(); + for (const s of metricSeries ?? []) { + for (const ts of s.timeslices ?? []) { + if (typeof ts.start_ns !== 'number') continue; + const v = ts[field]; + if (typeof v !== 'number' || !Number.isFinite(v)) continue; + sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v); + counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1); + } + } + if (combine === 'sum') return sums; + const out = new Map(); + for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 
1)); + return out; +} + +/** + * Parse the server_metrics_json → time-series arrays for KV cache util and + * prefix cache hit rate (per-interval, computed from the prometheus + * counters the same way trace-server-metrics does it). + * + * Aggregates across all engine series so multi-engine DP/PP deployments are + * counted correctly (previously we only read engine 0). + */ +export function extractServerMetricSamples(json: string): { + kvCacheUtil: number[]; + prefixCacheHitRate: number[]; +} { + const parsed = JSON.parse(json) as MetricsJson; + const metrics = parsed.metrics ?? {}; + + // KV cache util — per-engine gauge in [0, 1]. Average across engines so the + // value stays a percentage; summing would give meaningless 0..N. + const kvSeriesAll = + metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()]; + + // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across + // all engines. Sum first, then divide. + const hitsAll = + metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series; + const queriesAll = + metrics['vllm:prefix_cache_queries']?.series ?? + metrics['vllm:gpu_prefix_cache_queries']?.series; + const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum'); + const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum'); + const prefixCacheHitRate: number[] = []; + for (const [t, h] of hitsByT) { + const q = qByT.get(t); + if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q); + } + + return { kvCacheUtil, prefixCacheHitRate }; +} + +/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. 
*/ +const TARGET_METRIC_KEYS = new Set([ + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', // older fallback name + 'vllm:prefix_cache_hits', + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths) + 'vllm:gpu_prefix_cache_queries', +]); + +/** + * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics + * we need. Avoids the Node 512 MB string cap that JSON.parse hits on + * server_metrics blobs from high-conc TP+EP runs (which can decompress to + * >500 MB because vllm dumps `cache_config_info` every scrape interval). + * + * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') → + * StreamObject (one metric per chunk) → keep only the keys we care about. + * + * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the + * synchronous fast path so callers can use either interchangeably. + */ +async function streamExtractServerMetricSamples( + buffer: Buffer, +): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> { + const collected: Record = {}; + // stream-json's TypeScript types don't compose cleanly with node:stream's + // pipeline() generic, and several `.pipe()`/event APIs are typed loosely — + // cast to any for this local pipe chain. It works at runtime. + // stream-json composes transforms via stream-chain. `pick`/`streamObject` + // each return a Transform when called; `chain([...])` wires them. 
+ /* eslint-disable @typescript-eslint/no-explicit-any */ + const pipeline = chain([ + Readable.from(buffer), + createGunzip(), + parser(), + pick({ filter: 'metrics' }), + streamObject(), + ]); + await new Promise((resolve, reject) => { + (pipeline as any).on('data', (chunk: unknown) => { + const { key, value } = chunk as { key: string; value: MetricMeta }; + if (TARGET_METRIC_KEYS.has(key)) collected[key] = value; + }); + (pipeline as any).on('end', resolve); + (pipeline as any).on('error', reject); + }); + /* eslint-enable @typescript-eslint/no-explicit-any */ + return extractServerMetricSamples(JSON.stringify({ metrics: collected })); +} + +export async function getAgenticAggregates( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const result: AgenticAggregateMap = {}; + + // Fast path: read the pre-computed `aggregate_stats` JSONB written by the + // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One + // round-trip pulls everything we need for every requested id with no blob + // decompression, so the slow blob-parsing fallback only runs for ids + // whose stats are missing or were produced by an older `STATS_VERSION`. + const statsRows = (await sql` + select + br.id as benchmark_result_id, + atr.aggregate_stats as stats + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${benchmarkResultIds}::bigint[]) + `) as { + benchmark_result_id: number; + stats: AggregateStatsRow | null; + }[]; + + const idsNeedingProfile: number[] = []; + const idsNeedingServer: number[] = []; + for (const row of statsRows) { + const id = Number(row.benchmark_result_id); + const agg = blankAggregate(id); + if (row.stats && Number(row.stats.version) === STATS_VERSION) { + agg.isl = row.stats.isl ?? null; + agg.osl = row.stats.osl ?? null; + agg.kvCacheUtil = row.stats.kvCacheUtil ?? 
null; + agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null; + } else { + // No stats (or stale version) — schedule the blob-parse fallback below + // so the response still surfaces data. Backfill should drain these. + idsNeedingProfile.push(id); + idsNeedingServer.push(id); + } + result[id] = agg; + } + // Also fall back for ids that didn't return a row at all (no trace_replay + // link) — keep the caller contract: every id we know about lands in the map. + for (const id of benchmarkResultIds) { + if (!(id in result)) result[id] = blankAggregate(id); + } + + if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) { + return result; + } + + // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ────── + for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) { + const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE); + const rows = (await sql` + select + br.id as benchmark_result_id, + atr.profile_export_jsonl_gz as profile_blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + `) as { benchmark_result_id: number; profile_blob: Buffer | null }[]; + for (const row of rows) { + const id = Number(row.benchmark_result_id); + result[id] ??= blankAggregate(id); + if (row.profile_blob) { + try { + const jsonl = gunzipSync(row.profile_blob).toString('utf8'); + const { isl, osl } = extractIslOsl(jsonl); + result[id].isl = percentilesOf(isl); + result[id].osl = percentilesOf(osl); + } catch { + // ignore malformed blob + } + } + } + } + // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ─────── + // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row + // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow + // path runs at most once per sibling set. 
+ for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) { + const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE); + const rows = (await sql` + select + br.id as benchmark_result_id, + atr.server_metrics_json_gz as server_blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + `) as { benchmark_result_id: number; server_blob: Buffer | null }[]; + for (const row of rows) { + const id = Number(row.benchmark_result_id); + result[id] ??= blankAggregate(id); + if (!row.server_blob) continue; + let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null; + try { + const json = gunzipSync(row.server_blob).toString('utf8'); + parsed = extractServerMetricSamples(json); + } catch (error) { + // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose + // server_metrics_json decompresses past Node's max string length. + // Stream-parse to extract just the metric subtrees we care about. + const code = error && (error as NodeJS.ErrnoException).code; + const msg = error instanceof Error ? error.message : String(error); + if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) { + try { + parsed = await streamExtractServerMetricSamples(row.server_blob); + } catch { + // stream fallback failed too — leave nulls + } + } + } + if (parsed) { + result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil); + result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate); + } + } + } + return result; +} + +/** Shape of the JSONB column when read back via postgres-js. 
*/ +interface AggregateStatsRow { + version: number; + isl: MetricPercentiles | null; + osl: MetricPercentiles | null; + kvCacheUtil: MetricPercentiles | null; + prefixCacheHitRate: MetricPercentiles | null; + normalizedSessionTimeS: number | null; + p90PrefillTpsPerUser: number | null; +} + +function blankAggregate(id: number): AgenticAggregate { + return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null }; +} diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts new file mode 100644 index 00000000..245a1170 --- /dev/null +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -0,0 +1,132 @@ +/** + * Find all benchmark_results that share the same SKU (hardware + framework + + * model + precision + spec_method + disagg + benchmark_type + workflow_run) + * as the given point. Used by the detail page to render a "switch between + * concs / parallelisms" navigator within a single run. + */ + +import type { DbClient } from '../connection.js'; + +export interface BenchmarkSibling { + id: number; + conc: number; + /** "on" | "off" | null. */ + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + prefill_tp: number; + prefill_ep: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + /** True if this row IS the point passed in. */ + is_current: boolean; + /** Whether the row has a stored trace_replay blob (for navigation hint). */ + has_trace: boolean; +} + +export interface BenchmarkSku { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + /** Human-readable workflow_run summary so the page header can hint at provenance. 
*/ + github_run_id: number; + date: string; +} + +export interface BenchmarkSiblings { + sku: BenchmarkSku; + siblings: BenchmarkSibling[]; +} + +export async function getBenchmarkSiblings( + sql: DbClient, + benchmarkResultId: number, +): Promise { + // Step 1: resolve the SKU defining fields for the requested point. + const seed = (await sql` + select + c.hardware, c.framework, c.model, c.precision, c.spec_method, + br.benchmark_type, br.workflow_run_id, br.date::text, + wr.github_run_id + from benchmark_results br + join configs c on c.id = br.config_id + join workflow_runs wr on wr.id = br.workflow_run_id + where br.id = ${benchmarkResultId} + `) as unknown as { + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + benchmark_type: string; + workflow_run_id: number; + date: string; + github_run_id: number; + }[]; + const root = seed[0]; + if (!root) return null; + + // Step 2: pull every sibling row sharing the SKU within the same workflow_run. 
+ const rows = (await sql` + select + br.id, br.conc, br.offload_mode, + c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep, + c.num_prefill_gpu, c.num_decode_gpu, c.disagg, + (br.trace_replay_id is not null) as has_trace + from benchmark_results br + join configs c on c.id = br.config_id + where br.workflow_run_id = ${root.workflow_run_id} + and br.benchmark_type = ${root.benchmark_type} + and c.hardware = ${root.hardware} + and c.framework = ${root.framework} + and c.model = ${root.model} + and c.precision = ${root.precision} + and c.spec_method = ${root.spec_method} + order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc + `) as unknown as { + id: number; + conc: number; + offload_mode: string | null; + decode_tp: number; + decode_ep: number; + prefill_tp: number; + prefill_ep: number; + num_prefill_gpu: number; + num_decode_gpu: number; + disagg: boolean; + has_trace: boolean; + }[]; + + const siblings: BenchmarkSibling[] = rows.map((r) => ({ + id: Number(r.id), + conc: r.conc, + offload_mode: r.offload_mode, + decode_tp: r.decode_tp, + decode_ep: r.decode_ep, + prefill_tp: r.prefill_tp, + prefill_ep: r.prefill_ep, + num_prefill_gpu: r.num_prefill_gpu, + num_decode_gpu: r.num_decode_gpu, + disagg: r.disagg, + is_current: Number(r.id) === benchmarkResultId, + has_trace: r.has_trace, + })); + + return { + sku: { + hardware: root.hardware, + framework: root.framework, + model: root.model, + precision: root.precision, + spec_method: root.spec_method, + benchmark_type: root.benchmark_type, + github_run_id: Number(root.github_run_id), + date: root.date, + }, + siblings, + }; +} diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 1c30b1fd..2291dc0c 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -1,6 +1,13 @@ import type { DbClient } from '../connection.js'; export interface BenchmarkRow { + /** + * Stable per-point id from benchmark_results. 
Used by the frontend to look + * up associated detail blobs (e.g. trace_replay histograms). + * Typed as `number` although the column is a Postgres bigint: values above + * Number.MAX_SAFE_INTEGER would lose precision, but real ids stay far below it. + */ + id: number; hardware: string; framework: string; model: string; @@ -18,9 +25,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces; numeric for single_turn fixed-seq runs. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -42,8 +53,56 @@ export async function getLatestBenchmarks( modelKey: string | string[], date?: string, exact?: boolean, + /** + * If set, filter to a specific GitHub Actions workflow run. + * Bypasses the "latest per config" logic — when two runs landed on the same + * date and the user picked one in the run selector, this scopes the chart + * data to that run only. Value matches the URL param `g_runid` (a + * stringified github_run_id, not the DB id). + */ + runId?: string, ): Promise { const modelKeys = Array.isArray(modelKey) ? 
modelKey : [modelKey]; + if (runId) { + const rows = await sql` + SELECT + br.id, + c.hardware, + c.framework, + c.model, + c.precision, + c.spec_method, + c.disagg, + c.is_multinode, + c.prefill_tp, + c.prefill_ep, + c.prefill_dp_attention, + c.prefill_num_workers, + c.decode_tp, + c.decode_ep, + c.decode_dp_attention, + c.decode_num_workers, + c.num_prefill_gpu, + c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, + br.isl, + br.osl, + br.conc, + br.image, + br.metrics, + br.date::text, + CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id + WHERE c.model = ANY(${modelKeys}) + AND br.error IS NULL + AND wr.github_run_id = ${runId}::bigint + ORDER BY br.config_id, br.conc, br.isl, br.osl + `; + return rows as unknown as BenchmarkRow[]; + } if (date) { // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest) // exact=true: only return data from this exact date (for GPU comparison) @@ -51,6 +110,7 @@ export async function getLatestBenchmarks( const dateFilter = exact ? 
sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -68,6 +128,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -89,6 +151,7 @@ export async function getLatestBenchmarks( // No date filter: use materialized view for instant lookups const rows = await sql` SELECT + lb.id, c.hardware, c.framework, c.model, @@ -106,6 +169,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, lb.conc, @@ -153,6 +218,7 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, br.isl, br.osl, br.conc, diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts new file mode 100644 index 00000000..321434be --- /dev/null +++ b/packages/db/src/queries/derived-agentic-metrics.test.ts @@ -0,0 +1,96 @@ +import { describe, expect, it } from 'vitest'; + +import { computeDerivedFromBlob } from './derived-agentic-metrics.js'; + +/** Build one aiperf JSONL record for the synthetic fixture. 
*/ +function rec( + conversation_id: string, + turn_index: number, + fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number }, +): string { + return JSON.stringify({ + metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' }, + metrics: { + request_latency: { value: fields.latency_ms, unit: 'ms' }, + time_to_first_token: { value: fields.ttft_ms, unit: 'ms' }, + input_sequence_length: { value: fields.isl, unit: 'tokens' }, + output_sequence_length: { value: fields.osl, unit: 'tokens' }, + }, + }); +} + +describe('computeDerivedFromBlob', () => { + it('returns nulls when no usable records', () => { + const out = computeDerivedFromBlob(''); + expect(out.normalized_session_time_s).toBeNull(); + expect(out.p90_prefill_tps_per_user).toBeNull(); + }); + + it('rescales single-session time and computes P90 prefill', () => { + // One session, two turns. load = (100+50) + (200+50) = 400. + // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s. + const jsonl = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }), + ].join('\n'); + const out = computeDerivedFromBlob(jsonl); + expect(out.normalized_session_time_s).toBeCloseTo(3, 6); + // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200. 
+ expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6); + }); + + it('rescales times across sessions with unequal load', () => { + // s1: 1 turn, load = 100, T = 1s + // s2: 1 turn, load = 300, T = 3s + // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2 + // Mean T̃ = 2.0 + const jsonl = [ + rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }), + rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }), + ].join('\n'); + const out = computeDerivedFromBlob(jsonl); + expect(out.normalized_session_time_s).toBeCloseTo(2, 6); + }); + + it('drops records missing required fields and skips non-profiling phase', () => { + const lines = [ + rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }), + // missing TTFT — should be skipped + JSON.stringify({ + metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' }, + metrics: { + request_latency: { value: 1000, unit: 'ms' }, + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + // warmup phase — should be skipped + JSON.stringify({ + metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' }, + metrics: { + request_latency: { value: 9999, unit: 'ms' }, + time_to_first_token: { value: 9999, unit: 'ms' }, + input_sequence_length: { value: 100, unit: 'tokens' }, + output_sequence_length: { value: 50, unit: 'tokens' }, + }, + }), + ]; + const out = computeDerivedFromBlob(lines.join('\n')); + expect(out.normalized_session_time_s).toBeCloseTo(1, 6); + expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6); + }); + + it('p90 across turns: 10-turn session picks the right rank', () => { + // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910. 
+ const turns = Array.from({ length: 10 }, (_, i) => + rec('s1', i, { + isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens + osl: 10, + ttft_ms: 1000, // 1 second → rates: 100..1000 tps + latency_ms: 1500, + }), + ); + const out = computeDerivedFromBlob(turns.join('\n')); + expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6); + }); +}); diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts new file mode 100644 index 00000000..35a4b76c --- /dev/null +++ b/packages/db/src/queries/derived-agentic-metrics.ts @@ -0,0 +1,264 @@ +/** + * Live-computed per-point metrics derived from the stored aiperf + * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB + * because they require grouping by `conversation_id` and aggregating per + * session — work that's cheap once per agentic point but adds up to be + * meaningful only when actually plotted. + * + * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal + * (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of + * per-turn `request_latency` per session (inter-turn tool/thinking gaps are + * inherently excluded since we only sum the active GPU time, not wallclock). + * Each session's time is rescaled by `mean_load / session_load`, where load + * is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions. + * + * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart. + * Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn + * in every session — the per-session percentile + cross-session mean + * sandwich was discarded because it just dampens tail behavior. + */ + +import { gunzipSync } from 'node:zlib'; + +import type { DbClient } from '../connection.js'; +import { STATS_VERSION } from './agentic-aggregates'; + +export interface DerivedAgenticMetric { + /** benchmark_results.id this entry belongs to. 
*/ + id: number; + /** Mean normalized session time in seconds. */ + normalized_session_time_s: number | null; + /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */ + p90_prefill_tps_per_user: number | null; +} + +export type DerivedAgenticMetricMap = Record; + +/** + * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless + * HTTP driver caps responses at 64 MB — chunk to stay well under. + */ +const QUERY_CHUNK_SIZE = 6; + +interface RecordMetrics { + request_latency?: { value?: number; unit?: string } | number; + time_to_first_token?: { value?: number; unit?: string } | number; + input_sequence_length?: { value?: number } | number; + output_sequence_length?: { value?: number } | number; +} + +interface RecordMetadata { + conversation_id?: string; + turn_index?: number; + benchmark_phase?: string; +} + +interface ProfileRecord { + metadata?: RecordMetadata; + metrics?: RecordMetrics; +} + +interface TurnFields { + request_latency_ms: number; + ttft_ms: number; + isl: number; + osl: number; +} + +/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */ +function readNum(v: unknown): number | undefined { + if (typeof v === 'number') return v; + if (v && typeof v === 'object' && 'value' in v) { + const inner = (v as { value?: unknown }).value; + if (typeof inner === 'number' && Number.isFinite(inner)) return inner; + } + return undefined; +} + +function extractTurn(rec: ProfileRecord): TurnFields | null { + const m = rec.metrics ?? 
{}; + const rl = readNum(m.request_latency); + const tt = readNum(m.time_to_first_token); + const isl = readNum(m.input_sequence_length); + const osl = readNum(m.output_sequence_length); + if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null; + if (rl <= 0 || tt <= 0 || isl <= 0) return null; + return { request_latency_ms: rl, ttft_ms: tt, isl, osl }; +} + +/** Linear-interpolated percentile (matches numpy's default linear method). */ +function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 0) return Number.NaN; + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +function meanOf(xs: number[]): number { + if (xs.length === 0) return Number.NaN; + let s = 0; + for (const x of xs) s += x; + return s / xs.length; +} + +/** + * Parse one point's JSONL and return the two derived metrics. Both fields are + * `null` if the blob has no usable records. + */ +export function computeDerivedFromBlob(jsonl: string): { + normalized_session_time_s: number | null; + p90_prefill_tps_per_user: number | null; +} { + // Group records by conversation_id, filter to the profiling phase. 
+ const bySession = new Map(); + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: ProfileRecord; + try { + rec = JSON.parse(line) as ProfileRecord; + } catch { + continue; + } + if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue; + const sid = rec.metadata?.conversation_id; + if (!sid) continue; + const turn = extractTurn(rec); + if (!turn) continue; + let list = bySession.get(sid); + if (!list) { + list = []; + bySession.set(sid, list); + } + list.push(turn); + } + if (bySession.size === 0) { + return { normalized_session_time_s: null, p90_prefill_tps_per_user: null }; + } + + // Per-session aggregates for session time; per-turn prefill rates pool into + // a single global array so the percentile sees the full distribution. + const sessionTimesS: number[] = []; + const sessionLoads: number[] = []; + const allPrefillRates: number[] = []; + for (const turns of bySession.values()) { + let timeMs = 0; + let load = 0; + for (const t of turns) { + timeMs += t.request_latency_ms; + load += t.isl + t.osl; + const ttftSec = t.ttft_ms / 1000; + if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec); + } + if (load > 0) { + sessionTimesS.push(timeMs / 1000); + sessionLoads.push(load); + } + } + + // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean. + let normalized: number | null = null; + if (sessionTimesS.length > 0) { + const meanLoad = meanOf(sessionLoads); + if (meanLoad > 0) { + const scaled: number[] = []; + for (let i = 0; i < sessionTimesS.length; i++) { + const ti = sessionTimesS[i]!; + const li = sessionLoads[i]!; + if (li > 0) scaled.push(ti * (meanLoad / li)); + } + normalized = scaled.length > 0 ? 
meanOf(scaled) : null; + } + } + + let prefill: number | null = null; + if (allPrefillRates.length > 0) { + allPrefillRates.sort((a, b) => a - b); + prefill = quantile(allPrefillRates, 0.9); + } + + return { + normalized_session_time_s: normalized, + p90_prefill_tps_per_user: prefill, + }; +} + +export async function getDerivedAgenticMetrics( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const result: DerivedAgenticMetricMap = {}; + + // Fast path: read the pre-computed values out of `aggregate_stats`. The + // ingest pipeline computes both metrics in the same pass that produces the + // percentile bundles, so a single SQL round-trip covers most ids without + // touching the gzipped profile blob. + const statsRows = (await sql` + select + br.id as benchmark_result_id, + atr.aggregate_stats as stats + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${benchmarkResultIds}::bigint[]) + `) as { + benchmark_result_id: number; + stats: { + version?: number; + normalizedSessionTimeS?: number | null; + p90PrefillTpsPerUser?: number | null; + } | null; + }[]; + + const idsNeedingBlob: number[] = []; + for (const row of statsRows) { + const id = Number(row.benchmark_result_id); + if (row.stats && Number(row.stats.version) === STATS_VERSION) { + result[id] = { + id, + normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null, + p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null, + }; + } else { + idsNeedingBlob.push(id); + } + } + + if (idsNeedingBlob.length === 0) return result; + + // Fallback: parse the profile blob directly. Used for rows whose + // `aggregate_stats` is null or computed by an older STATS_VERSION; the + // backfill script drains the population so this path should be rare. 
+ const rows: { benchmark_result_id: number; blob: Buffer }[] = []; + for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) { + const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE); + const chunkRows = (await sql` + select + br.id as benchmark_result_id, + atr.profile_export_jsonl_gz as blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + and atr.profile_export_jsonl_gz is not null + `) as { benchmark_result_id: number; blob: Buffer }[]; + rows.push(...chunkRows); + } + + for (const row of rows) { + try { + const jsonl = gunzipSync(row.blob).toString('utf8'); + const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl); + result[Number(row.benchmark_result_id)] = { + id: Number(row.benchmark_result_id), + normalized_session_time_s, + p90_prefill_tps_per_user, + }; + } catch { + // Skip malformed blobs silently — frontend treats missing ids as "no data". + } + } + return result; +} diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts new file mode 100644 index 00000000..2bd3e251 --- /dev/null +++ b/packages/db/src/queries/request-timeline.ts @@ -0,0 +1,48 @@ +/** + * Per-request timeline for the agentic detail page's Gantt view. + * + * Backed by `agentic_trace_replay.request_timeline` (pre-computed at + * ingest time, see `etl/compute-request-timeline.ts`). The fast path is + * a single SQL row read; the slow path re-computes from + * `profile_export_jsonl_gz` and is only taken when the column is missing + * or the stored `REQUEST_TIMELINE_VERSION` is stale. 
+ */ + +import { + REQUEST_TIMELINE_VERSION, + computeRequestTimeline, + type RequestTimeline, +} from '../etl/compute-request-timeline'; + +import type { DbClient } from '../connection.js'; + +export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline'; + +interface RawRow { + blob: Buffer | null; + request_timeline: RequestTimeline | null; +} + +export async function getRequestTimeline( + sql: DbClient, + benchmarkResultId: number, +): Promise { + const rows = (await sql` + select + atr.profile_export_jsonl_gz as blob, + atr.request_timeline + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = ${benchmarkResultId} + `) as unknown as RawRow[]; + const row = rows[0]; + if (!row) return null; + + // Fast path: pre-computed timeline at the current version. + if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) { + return row.request_timeline; + } + + // Slow path: recompute from the blob (rare — only stale/missing rows). + return computeRequestTimeline(row.blob); +} diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts new file mode 100644 index 00000000..20ebc0d5 --- /dev/null +++ b/packages/db/src/queries/trace-histograms.ts @@ -0,0 +1,95 @@ +/** + * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl` + * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller + * passes the set of `benchmark_results.id`s it wants and receives one entry + * per id that actually has a trace_replay blob (others are silently skipped). + * + * The JSONL has one JSON object per request with the shape: + * { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } } + * + * Returns raw arrays rather than pre-binned histograms — payload stays tiny + * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin + * however it wants. 
+ */ + +import { gunzipSync } from 'node:zlib'; + +import type { DbClient } from '../connection.js'; + +export interface TraceHistogramPoint { + /** benchmark_results.id this entry belongs to. */ + id: number; + /** Input sequence length (tokens) per completed request. */ + isl: number[]; + /** Output sequence length (tokens) per completed request. */ + osl: number[]; +} + +export type TraceHistogramMap = Record; + +/** + * Cap the number of blobs we pull in a single Neon HTTP query — the serverless + * driver returns 507 ("response is too large, max 64 MB") if the combined gzip + * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB + * compressed, so we stay well below the cap at 12. + */ +const QUERY_CHUNK_SIZE = 12; + +export async function getTraceHistograms( + sql: DbClient, + benchmarkResultIds: number[], +): Promise { + if (benchmarkResultIds.length === 0) return {}; + + const rows: { benchmark_result_id: number; blob: Buffer }[] = []; + for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) { + const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE); + const chunkRows = (await sql` + select + br.id as benchmark_result_id, + atr.profile_export_jsonl_gz as blob + from benchmark_results br + join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = any(${chunk}::bigint[]) + and atr.profile_export_jsonl_gz is not null + `) as { benchmark_result_id: number; blob: Buffer }[]; + rows.push(...chunkRows); + } + + const result: TraceHistogramMap = {}; + for (const row of rows) { + try { + const jsonl = gunzipSync(row.blob).toString('utf8'); + const isl: number[] = []; + const osl: number[] = []; + for (const line of jsonl.split('\n')) { + if (!line) continue; + let rec: { metrics?: Record }; + try { + rec = JSON.parse(line); + } catch { + continue; + } + const m = rec.metrics ?? 
{}; + const islVal = readMetric(m['input_sequence_length']); + const oslVal = readMetric(m['output_sequence_length']); + if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal); + if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal); + } + result[Number(row.benchmark_result_id)] = { + id: Number(row.benchmark_result_id), + isl, + osl, + }; + } catch { + // Drop malformed blobs silently — caller treats missing ids as "no data". + } + } + return result; +} + +function readMetric(v: { value?: number } | number | undefined): number | undefined { + if (v === undefined || v === null) return undefined; + if (typeof v === 'number') return v; + return v.value; +} diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts new file mode 100644 index 00000000..624b6ed3 --- /dev/null +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -0,0 +1,156 @@ +/** + * Time-series view of one agentic benchmark point: chart-ready arrays for + * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS, + * and per-source prompt-token counts. + * + * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest + * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL + * row read; the slow path re-computes from `server_metrics_json_gz` and is + * only taken when the column is missing or the stored + * `CHART_SERIES_VERSION` is stale (the backfill script should drain that). 
+ */ + +import { + CHART_SERIES_VERSION, + computeChartSeries, + type ChartSeries, + type QueueDepthPoint, + type TimeSeriesPoint, +} from '../etl/compute-chart-series'; + +import type { DbClient } from '../connection.js'; + +export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series'; + +export interface PointMeta { + id: number; + hardware: string; + framework: string; + model: string; + precision: string; + spec_method: string; + disagg: boolean; + conc: number; + offload_mode: string | null; + isl: number | null; + osl: number | null; + benchmark_type: string; + date: string; + /** GitHub Actions run URL for jumping to the source. */ + run_url: string | null; + /** Cumulative end-of-run cache-hit number the dashboard already shows. */ + server_gpu_cache_hit_rate: number | null; + /** Cumulative end-of-run CPU offload cache-hit. */ + server_cpu_cache_hit_rate: number | null; +} + +export interface TraceServerMetrics { + /** Point context — hardware, model, conc, etc. for the page header. */ + meta: PointMeta; + /** ns wall-clock of the first window's start; for debugging only. */ + startNs: number; + /** ns wall-clock of the last window's end. */ + endNs: number; + /** Total benchmark window in seconds. */ + durationS: number; + /** Number of 1Hz windows captured. */ + timeslicesCount: number; + /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */ + kvCacheUsage: TimeSeriesPoint[]; + /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */ + prefixCacheHitRate: TimeSeriesPoint[]; + /** Request queue depth: running, waiting, total per scrape. */ + queueDepth: QueueDepthPoint[]; + /** + * Per-source prompt-token counts over time (counter rate per scrape). + * Keyed by the value of the `source` label (typically `local_cache_hit`, + * `external_cache_hit`, `miss`, etc.). Plot as stacked area. + */ + promptTokensBySource: Record; + /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. 
*/ + prefillTps: TimeSeriesPoint[]; + /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */ + decodeTps: TimeSeriesPoint[]; +} + +interface RawMetaRow extends PointMeta { + blob: Buffer | null; + chart_series: ChartSeries | null; +} + +function buildMeta(row: RawMetaRow): PointMeta { + return { + id: Number(row.id), + hardware: row.hardware, + framework: row.framework, + model: row.model, + precision: row.precision, + spec_method: row.spec_method, + disagg: row.disagg, + conc: row.conc, + offload_mode: row.offload_mode, + isl: row.isl, + osl: row.osl, + benchmark_type: row.benchmark_type, + date: row.date, + run_url: row.run_url, + server_gpu_cache_hit_rate: + row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate), + server_cpu_cache_hit_rate: + row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate), + }; +} + +function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { + return { + meta, + startNs: series.startNs, + endNs: series.endNs, + durationS: series.durationS, + timeslicesCount: series.timeslicesCount, + kvCacheUsage: series.kvCacheUsage, + prefixCacheHitRate: series.prefixCacheHitRate, + queueDepth: series.queueDepth, + promptTokensBySource: series.promptTokensBySource, + prefillTps: series.prefillTps, + decodeTps: series.decodeTps, + }; +} + +export async function getTraceServerMetrics( + sql: DbClient, + benchmarkResultId: number, +): Promise { + const rows = (await sql` + select + atr.server_metrics_json_gz as blob, + atr.chart_series, + br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg, + br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type, + br.date::text, + case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url, + (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate, + (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as 
server_cpu_cache_hit_rate + from benchmark_results br + join configs c on c.id = br.config_id + join workflow_runs wr on wr.id = br.workflow_run_id + left join agentic_trace_replay atr on atr.id = br.trace_replay_id + where br.id = ${benchmarkResultId} + `) as unknown as RawMetaRow[]; + const row = rows[0]; + if (!row) return null; + if (!row.blob) return null; + const meta = buildMeta(row); + + // Fast path: pre-computed chart_series at the current version. + if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) { + return merge(meta, row.chart_series); + } + + // Slow path: compute from the blob. `computeChartSeries` handles + // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP + // rows succeed even before the backfill drains them. + const series = await computeChartSeries(row.blob); + if (!series) return null; + return merge(meta, series); +} diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index b4e4f255..d5e2d933 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` - SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text + SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text FROM availability a WHERE EXISTS ( SELECT 1 @@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise= 0.4'} + stream-chain@3.6.3: + resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==} + stream-combiner@0.2.2: resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==} + stream-json@2.1.0: + resolution: {integrity: 
sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==} + string-width@4.2.3: resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} engines: {node: '>=8'} @@ -7392,6 +7413,15 @@ snapshots: '@types/stats.js@0.17.4': {} + '@types/stream-chain@2.1.0': + dependencies: + '@types/node': 25.7.0 + + '@types/stream-json@1.7.8': + dependencies: + '@types/node': 25.7.0 + '@types/stream-chain': 2.1.0 + '@types/three@0.184.1': dependencies: '@dimforge/rapier3d-compat': 0.12.0 @@ -10752,11 +10782,17 @@ snapshots: es-errors: 1.3.0 internal-slot: 1.1.0 + stream-chain@3.6.3: {} + stream-combiner@0.2.2: dependencies: duplexer: 0.1.2 through: 2.3.8 + stream-json@2.1.0: + dependencies: + stream-chain: 3.6.3 + string-width@4.2.3: dependencies: emoji-regex: 8.0.0