;
@@ -1839,6 +2071,9 @@ const ScatterGraph = React.memo(
chartDefinition.chartType,
xScaleConfig._isLog,
yScaleConfig.type,
+ optimalPointKeys,
+ getCssColor,
+ resolveColor,
],
);
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index f9b1b3c8..73018483 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
`${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
]
}{' '}
- {graph.chartDefinition[
- `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
- ] || graph.chartDefinition.heading}
+ {graph.chartDefinition.heading}
{graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..589ba580 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -157,12 +157,12 @@ describe('processOverlayChartData', () => {
});
it('remaps x to config override for input metrics on interactivity chart', () => {
- // inputTputPerGpu has x override to p99_ttft on interactivity chart
+ // inputTputPerGpu has x override to p90_ttft on interactivity chart
const data = [
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_intvty: 50,
} as any),
];
@@ -176,16 +176,11 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- median_ttft: 0.1,
+ p90_ttft: 0.1,
median_intvty: 50,
} as any),
];
- const result = processOverlayChartData(
- data,
- 'interactivity',
- 'y_inputTputPerGpu',
- 'median_ttft',
- );
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.1);
});
@@ -195,76 +190,62 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_e2el: 2.5,
} as any),
];
const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
expect(result).toHaveLength(1);
- // e2e uses median_e2el as x (from chart config default), not p99_ttft
+ // e2e uses median_e2el as x (from chart config default), not p90_ttft
expect(result[0].x).toBe(2.5);
});
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
- const data = [
- pt({
- x: 100,
- tpPerGpu: { y: 42, roof: false },
- p99_ttft: 0.35,
- median_e2el: 2.5,
- } as any),
- ];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
- expect(result).toHaveLength(1);
- expect(result[0].x).toBe(0.35);
- });
-
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+ it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
const data = [
pt({
x: 100,
tpPerGpu: { y: 42, roof: false },
- median_ttft: 0.12,
+ p90_ttft: 0.12,
median_e2el: 2.5,
} as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.12);
});
it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
const data = [
- pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
- pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+ pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+ pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
// y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
it('does not filter interactivity points by latency limit when x-axis is default', () => {
- // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+ // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
// chart's x-axis stays median_intvty for non-input metrics. The latency limit
// (60) must NOT apply to median_intvty values.
const data = [
pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(2);
});
it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
- // When an input metric IS selected and x-axis overrides to p99_ttft,
+ // When an input metric IS selected and x-axis overrides to p90_ttft,
// the latency limit should apply.
const data = [
- pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
- pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+ pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+ pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
- // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+ // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..4876c614 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -75,11 +75,13 @@ export function processOverlayChartData(
chartType: 'e2e' | 'interactivity',
selectedYAxisMetric: string,
selectedXAxisMetric: string | null,
+ options?: { isAgentic?: boolean },
): InferenceData[] {
const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
if (!chartDef) return [];
const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+ const isAgentic = options?.isAgentic === true;
// Resolve x-axis field (must match useChartData logic)
const metricTitle =
@@ -87,9 +89,11 @@ export function processOverlayChartData(
const isInputMetric = metricTitle.toLowerCase().includes('input');
let xAxisField: string = chartDef.x;
// selectedXAxisMetric is already the effective metric for this chart type
- // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
+ // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+ // Match any *_ttft metric — the x-axis-mode picker can now select any
+ // percentile (median/p75/p90/p99) depending on sequence kind.
const isTtftOverride =
- selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+ typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
xAxisField = selectedXAxisMetric;
@@ -109,7 +113,12 @@ export function processOverlayChartData(
})
.filter(
(d) =>
- xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+ // Skip the latency limit for the natural x-axis or for agentic
+ // (long TTFTs are normal there, not overload outliers).
+ xAxisField === chartDef.x ||
+ isAgentic ||
+ !chartDef.y_latency_limit ||
+ d.x <= chartDef.y_latency_limit,
);
return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 4c56d217..ccc371f9 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -19,6 +19,13 @@ export interface TooltipConfig {
isTracked?: boolean;
/** URL to the GitHub Actions workflow run */
runUrl?: string;
+ /**
+ * Per-request ISL/OSL arrays for agentic points, sourced from the stored
+ * aiperf `profile_export.jsonl`. Used to detect whether the point has any
+ * trace data (so the "View charts" button can appear); the actual
+ * distributions are rendered on the detail page, not inline.
+ */
+ traceHistogram?: { isl: number[]; osl: number[] } | undefined;
}
export interface OverlayTooltipConfig extends TooltipConfig {
@@ -88,6 +95,74 @@ const runLinkHTML = (runUrl?: string) =>
const tooltipLine = (label: string, value: string | number) =>
`
${label}: ${value}
`;
+const formatPct = (v: number | null | undefined): string | null =>
+  v == null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; // fraction → "12.3%"; null when absent/NaN
+
+/** Tooltip number formatter: max 3 decimals, no trailing zeros, en-US grouping at >= 10,000. */
+const fmt = (v: number): string => {
+  if (!Number.isFinite(v)) return `${v}`;
+  const capped = Number.parseFloat(v.toFixed(3));
+  const needsGrouping = Math.abs(capped) >= 10000;
+  return needsGrouping ? new Intl.NumberFormat('en-US').format(capped) : `${capped}`;
+};
+
+/**
+ * Builds the agentic-only tooltip rows (offload mode, cache hit rates, request
+ * success, token totals); returns '' for any row that is not `agentic_traces`.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+  if (d.benchmark_type !== 'agentic_traces') return '';
+
+  const parts: string[] = [];
+  if (d.offload_mode) {
+    parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+  }
+
+  const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+  const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+  const theoHit = formatPct(d.theoretical_cache_hit_rate);
+  if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+  if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+  if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+  // `!= null` rejects null as well as undefined (as formatPct does) so no "null / null" row.
+  if (d.num_requests_total != null && d.num_requests_successful != null) {
+    const successPct =
+      d.num_requests_total > 0
+        ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)`
+        : '';
+    parts.push(
+      tooltipLine(
+        'Requests',
+        `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`,
+      ),
+    );
+  }
+
+  if (d.total_prompt_tokens != null) {
+    parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+  }
+  if (d.total_generation_tokens != null) {
+    parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+  }
+
+  // Histograms + time-series live on the dedicated detail page now; the
+  // "View charts" button (rendered by the wrapper when pinned + has trace
+  // data) takes the user there.
+
+  return parts.join('');
+};
+
+/** "View charts" button — only visible when the tooltip is pinned and the
+ * point has stored trace data. Wired up by the ScatterGraph click handler. */
+const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => {
+ if (!isPinned || !hasTraceData) return '';
+ return `View charts → `;
+};
+
const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
const imageTooltipLine = (image: string) =>
@@ -138,7 +213,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
* @returns HTML string for the tooltip content
*/
export const generateTooltipContent = (config: TooltipConfig): string => {
- const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+ const {
+ data: d,
+ isPinned,
+ xLabel,
+ yLabel,
+ selectedYAxisMetric,
+ hardwareConfig,
+ runUrl,
+ traceHistogram,
+ } = config;
return `
@@ -156,16 +240,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -173,7 +257,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -182,10 +266,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
+ ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
${
isPinned
? `
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${tooltipLine('Total GPUs', d.tp)}
${generateParallelismHTML(d)}
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`;
};
@@ -271,16 +358,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -288,7 +375,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -297,9 +384,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
`;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index a9e087b2..19b4bfb0 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -5,17 +5,30 @@ import { Info } from 'lucide-react';
import { LabelWithTooltip } from '@/components/ui/label-with-tooltip';
import { track } from '@/lib/analytics';
import { MultiSelect } from '@/components/ui/multi-select';
+import {
+ Select,
+ SelectContent,
+ SelectGroup,
+ SelectItem,
+ SelectLabel,
+ SelectTrigger,
+ SelectValue,
+} from '@/components/ui/select';
import { TooltipContent, TooltipRoot, TooltipTrigger } from '@/components/ui/tooltip';
import {
type Model,
type Precision,
type Sequence,
+ type Percentile,
+ PERCENTILE_OPTIONS,
getModelCategory,
getModelLabel,
+ getPercentileLabel,
getPrecisionLabel,
getSequenceCategory,
getSequenceLabel,
groupByCategory,
+ sequenceKind,
} from '@/lib/data-mappings';
function DeprecatedSectionTitle({ reason }: { reason: string }) {
@@ -200,6 +213,132 @@ export function SequenceSelector({
);
}
+interface ScenarioSelectorProps {
+ id?: string;
+ value: string;
+ onChange: (value: Sequence) => void;
+ open?: boolean;
+ onOpenChange?: (open: boolean) => void;
+ availableSequences: string[];
+ 'data-testid'?: string;
+}
+
+/**
+ * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length",
+ * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL
+ * framing only applies to the fixed-seq subset).
+ */
+export function ScenarioSelector({
+ id = 'scenario-select',
+ value,
+ onChange,
+ open,
+ onOpenChange,
+ availableSequences,
+ 'data-testid': testId,
+}: ScenarioSelectorProps) {
+ const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq');
+ const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic');
+ const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence));
+
+ return (
+
+
+ {
+ track('selector_scenario_changed', { scenario: v });
+ onChange(v as Sequence);
+ }}
+ open={open}
+ onOpenChange={onOpenChange}
+ >
+
+
+
+
+ {fixedSeq.length > 0 && (
+
+ Fixed Sequence Length
+ {fixedGroups.default.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+ {fixedGroups.deprecated.length > 0 && (
+ <>
+
+ {fixedGroups.deprecated.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+ >
+ )}
+
+ )}
+ {agentic.map((seq) => (
+
+ {getSequenceLabel(seq as Sequence)}
+
+ ))}
+
+
+
+ );
+}
+
+interface PercentileSelectorProps {
+ id?: string;
+ value: string;
+ onChange: (value: Percentile) => void;
+ 'data-testid'?: string;
+}
+
+/**
+ * Latency percentile selector for agentic-trace charts. The selected value
+ * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so
+ * picking p99 plots p99 e2e latency / interactivity instead of the median.
+ */
+export function PercentileSelector({
+ id = 'percentile-select',
+ value,
+ onChange,
+ 'data-testid': testId,
+}: PercentileSelectorProps) {
+ return (
+
+
+ {
+ track('selector_percentile_changed', { percentile: v });
+ onChange(v as Percentile);
+ }}
+ >
+
+
+
+
+ {PERCENTILE_OPTIONS.map((p) => (
+
+ {getPercentileLabel(p)}
+
+ ))}
+
+
+
+ );
+}
+
interface PrecisionSelectorProps {
id?: string;
value: string[];
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
'use client';
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Renders the d3 tooltip element via React Portal to document.body so it
+ * escapes any parent stacking context (e.g. the chart Card's backdrop-filter
+ * creates one, trapping z-index inside it). Position is set as viewport
+ * coordinates by the d3 layer.
+ */
+function PortalTooltip({
+ tooltipRef,
+ pinned,
+}: {
+ tooltipRef: React.RefObject
;
+ pinned: boolean;
+}) {
+ const [mounted, setMounted] = useState(false);
+ useEffect(() => setMounted(true), []);
+ const node = (
+
+ );
+ if (!mounted || typeof document === 'undefined') return node;
+ return createPortal(node, document.body);
+}
export interface D3ChartWrapperProps {
chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
}
}}
/>
-
+ {/* Tooltip is portalled to <body> with position:fixed so it can
+ rise above sibling chart cards' stacking contexts. The d3 layer
+ writes viewport-coords into style.left/top — see
+ computeTooltipPosition. */}
+
{noDataOverlay}
{instructions}
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index 1863060d..3c24d32b 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r
/** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */
function stubRow(overrides: Partial = {}): BenchmarkRow {
return {
+ id: 1,
hardware: 'h200',
framework: 'sglang',
model: 'dsr1',
@@ -29,6 +30,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow {
decode_num_workers: 0,
num_prefill_gpu: 8,
num_decode_gpu: 8,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
isl: 1024,
osl: 1024,
conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 6fd3aba1..dd2b0dbf 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
import { computeToggle } from '@/hooks/useTogglableSet';
import type { BenchmarkRow, EvalRow } from '@/lib/api';
import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -110,7 +110,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
const groups = new Map();
for (const row of benchmarks) {
const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
- const sequence = islOslToSequence(row.isl, row.osl);
+ const sequence = rowToSequence(row);
if (!sequence) continue;
const key = `${displayModel}_${sequence}`;
if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..4ca25ee2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,45 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface MetricPercentiles {
+ mean: number;
+ p50: number;
+ p75: number;
+ p90: number;
+ p99: number;
+ n: number;
+}
+
+export interface AgenticAggregate {
+ id: number;
+ isl: MetricPercentiles | null;
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>; // per-id map; key type assumed — confirm
+
+async function fetchAgenticAggregates(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<AgenticAggregateMap> {
+  if (ids.length === 0) return {}; // nothing to look up — skip the request
+  const res = await fetch(`/api/v1/agentic-aggregates?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`agentic-aggregates ${res.status}`); // non-2xx rejects the promise
+  return (await res.json()) as AgenticAggregateMap; // payload is cast, not validated
+}
+
+/**
+ * React Query hook: per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL,
+ * KV cache utilization, and prefix cache hit rate — feeds the "Aggregates
+ * across configs" view on the agentic detail page. Ids are de-duplicated and sorted.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).toSorted((lhs, rhs) => lhs - rhs);
+  return useQuery({
+    staleTime: 5 * 60 * 1000,
+    enabled: uniqueIds.length > 0 && enabled,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchAgenticAggregates(uniqueIds, signal),
+    queryKey: ['agentic-aggregates', uniqueIds.join(',')] as const,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..1ea90c0d
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,46 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface BenchmarkSibling {
+ id: number;
+ conc: number;
+ offload_mode: string | null;
+ decode_tp: number;
+ decode_ep: number;
+ prefill_tp: number;
+ prefill_ep: number;
+ num_prefill_gpu: number;
+ num_decode_gpu: number;
+ disagg: boolean;
+ is_current: boolean;
+ has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ benchmark_type: string;
+ github_run_id: number;
+ date: string;
+}
+
+export interface BenchmarkSiblings {
+ sku: BenchmarkSku;
+ siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+  return useQuery({
+    staleTime: 5 * 60 * 1000,
+    enabled: id !== null && id > 0, // only positive ids trigger a request
+    queryKey: ['benchmark-siblings', id] as const,
+    queryFn: async ({ signal }) => {
+      const response = await fetch(`/api/v1/benchmark-siblings?id=${id}`, { signal });
+      if (response.status === 404) return null; // 404 resolves to null instead of throwing
+      if (response.ok) return (await response.json()) as BenchmarkSiblings;
+      throw new Error(`benchmark-siblings ${response.status}`);
+    },
+  });
+}
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
index 7329896d..c4f49130 100644
--- a/packages/app/src/hooks/api/use-benchmarks.test.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -5,12 +5,29 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
describe('benchmarkQueryOptions', () => {
it('builds query key from model and date', () => {
const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01');
- expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']);
+ expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest', '']);
});
it('builds exact query key when exact=true', () => {
const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
- expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']);
+ expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', '']);
+ });
+
+ it('includes runId in query key when provided', () => {
+ const opts = benchmarkQueryOptions(
+ 'DeepSeek-R1-0528',
+ '2026-03-01',
+ true,
+ false,
+ '26194160120',
+ );
+ expect(opts.queryKey).toEqual([
+ 'benchmarks',
+ 'DeepSeek-R1-0528',
+ '2026-03-01',
+ 'latest',
+ '26194160120',
+ ]);
});
it('produces distinct keys for different models', () => {
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index 6da1568e..8fd1f4e9 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,14 +8,16 @@ export function benchmarkQueryOptions(
date: string,
enabled = true,
exact?: boolean,
+ runId?: string,
) {
return {
- queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const,
- queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal),
+ queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? ''] as const,
+ queryFn: ({ signal }: { signal: AbortSignal }) =>
+ fetchBenchmarks(model, date, exact, signal, runId),
enabled: enabled && Boolean(model),
};
}
-export function useBenchmarks(model: string, date?: string, enabled = true) {
- return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled));
+export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
+ return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
}
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..6bc7ae5e
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,41 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface DerivedAgenticMetric {
+ id: number;
+ /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+ * by mean_load / session_load. Null when the JSONL had no usable records. */
+ normalized_session_time_s: number | null;
+ /** P90 of per-turn ISL/TTFT across every turn in every session.
+ * Null when no prefill rates could be computed. */
+ p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>; // per-id map; key type assumed — confirm
+
+async function fetchDerivedAgenticMetrics(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  if (ids.length === 0) return {}; // nothing to look up — skip the request
+  const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`); // non-2xx rejects the promise
+  return (await res.json()) as DerivedAgenticMetricMap; // payload is cast, not validated
+}
+
+/**
+ * React Query hook for the derived agentic metrics of each id (session time and
+ * p90 prefill TPS/user), computed live from the stored aiperf
+ * profile_export.jsonl. Drives the "Session Time" and "Prefill TPS/user" charts.
+ *
+ * Ids without a trace_replay blob (older or non-aiperf agentic runs) are
+ * silently omitted from the response.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).toSorted((lhs, rhs) => lhs - rhs);
+  return useQuery({
+    staleTime: 5 * 60 * 1000,
+    enabled: uniqueIds.length > 0 && enabled,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchDerivedAgenticMetrics(uniqueIds, signal),
+    queryKey: ['derived-agentic-metrics', uniqueIds.join(',')] as const,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..d3ceaab8
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface RequestRecord {
+ /** Conversation id (groups turns of one agent session). */
+ cid: string;
+ /** Zero-based turn index within the conversation. */
+ ti: number;
+ /** Worker id (concurrency slot that handled this request). */
+ wid: string;
+ /** Sub-agent depth (0 = top-level). */
+ ad: number;
+ /** `warmup` or `profiling`. */
+ phase: string;
+ /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+ credit: number;
+ /** ns offset from timeline.startNs. HTTP send started. */
+ start: number;
+ /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+ ack: number | null;
+ /** ns offset from timeline.startNs. Last byte received. */
+ end: number;
+ ttftMs: number | null;
+ isl: number | null;
+ osl: number | null;
+ cancelled: boolean;
+}
+
+export interface RequestTimeline {
+ version: number;
+ startNs: number;
+ endNs: number;
+ durationS: number;
+ requests: RequestRecord[];
+}
+
+async function fetchRequestTimeline(
+  id: number,
+  signal?: AbortSignal,
+): Promise<RequestTimeline | null> {
+  const res = await fetch(`/api/v1/request-timeline?id=${id}`, { signal });
+  if (res.status === 404) return null; // 404 resolves to null rather than an error
+  if (!res.ok) throw new Error(`request-timeline ${res.status}`); // other non-2xx rejects the promise
+  return (await res.json()) as RequestTimeline; // payload is cast, not validated
+}
+
+/**
+ * Opt-in React Query hook for one agentic point's per-request Gantt timeline.
+ * Disabled by default: callers enable it (e.g. when the timeline view becomes
+ * active) so the ~30 KB per-point payload isn't fetched on every page load.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+  return useQuery({
+    staleTime: 5 * 60 * 1000,
+    enabled: Boolean(id) && enabled,
+    queryKey: ['request-timeline', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      !id ? Promise.resolve(null) : fetchRequestTimeline(id, signal),
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..db4220d2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,39 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TraceHistogramPoint {
+ id: number;
+ /** Input sequence length (tokens) per completed request. */
+ isl: number[];
+ /** Output sequence length (tokens) per completed request. */
+ osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>; // per-id map; key type assumed — confirm
+
+async function fetchTraceHistograms(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceHistogramMap> {
+  if (ids.length === 0) return {}; // nothing to look up — skip the request
+  const res = await fetch(`/api/v1/trace-histograms?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`trace-histograms ${res.status}`); // non-2xx rejects the promise
+  return (await res.json()) as TraceHistogramMap; // payload is cast, not validated
+}
+
+/**
+ * React Query hook returning per-request ISL/OSL arrays for a set of
+ * benchmark_results.id values. Ids with no stored trace_replay blob are
+ * silently omitted from the response.
+ *
+ * Pass the agentic ids currently on screen. The ids are de-duplicated, sorted
+ * and comma-joined into the cache key, so any ordering of one set shares an entry.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+  const uniqueIds = Array.from(new Set(ids)).toSorted((lhs, rhs) => lhs - rhs);
+  return useQuery({
+    staleTime: 5 * 60 * 1000,
+    enabled: uniqueIds.length > 0 && enabled,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceHistograms(uniqueIds, signal),
+    queryKey: ['trace-histograms', uniqueIds.join(',')] as const,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..8418aa4f
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,70 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TimeSeriesPoint {
+ /** Seconds from benchmark start. */
+ t: number;
+ value: number;
+}
+export interface QueueDepthPoint {
+ t: number;
+ running: number;
+ waiting: number;
+ total: number;
+}
+export interface PointMeta {
+ id: number;
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ disagg: boolean;
+ conc: number;
+ offload_mode: string | null;
+ isl: number | null;
+ osl: number | null;
+ benchmark_type: string;
+ date: string;
+ run_url: string | null;
+ server_gpu_cache_hit_rate: number | null;
+ server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+ meta: PointMeta;
+ startNs: number;
+ endNs: number;
+ durationS: number;
+ timeslicesCount: number;
+ kvCacheUsage: TimeSeriesPoint[];
+ prefixCacheHitRate: TimeSeriesPoint[];
+ queueDepth: QueueDepthPoint[];
+ promptTokensBySource: Record;
+ prefillTps: TimeSeriesPoint[];
+ decodeTps: TimeSeriesPoint[];
+}
+
+/**
+ * Request the server-metric time-series for one benchmark point.
+ *
+ * @param id Value sent as the `id` query parameter.
+ * @param signal Optional abort signal forwarded to `fetch`.
+ * @returns The parsed JSON body, or `null` when the endpoint answers 404.
+ * @throws Error carrying the HTTP status for any other non-ok response.
+ */
+async function fetchTraceServerMetrics(
+  id: number,
+  signal?: AbortSignal,
+): Promise<TraceServerMetrics | null> {
+  const res = await fetch(`/api/v1/trace-server-metrics?id=${id}`, { signal });
+  // A 404 is reported as "no data" (null) rather than as an error.
+  if (res.status === 404) return null;
+  if (!res.ok) throw new Error(`trace-server-metrics ${res.status}`);
+  // The body is cast, not validated: its shape is trusted to match TraceServerMetrics.
+  return (await res.json()) as TraceServerMetrics;
+}
+
+/**
+ * React Query hook for the parsed server-metric time-series of one agentic
+ * point. The query is off by default: it only runs once the caller passes
+ * `enabled=true` (the detail panel opens) together with a truthy `id`, so the
+ * parse cost is not paid on every hover. Results are treated as fresh for
+ * five minutes.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+  const canFetch = enabled && Boolean(id);
+  return useQuery({
+    queryKey: ['trace-server-metrics', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => {
+      // A missing id resolves to null without touching the API.
+      if (!id) return Promise.resolve(null);
+      return fetchTraceServerMetrics(id, signal);
+    },
+    enabled: canFetch,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 999cbfde..31cf906a 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -6,6 +6,8 @@
import type { SubmissionsResponse } from './submissions-types';
export interface BenchmarkRow {
+ /** Stable per-point id from benchmark_results; used to look up trace histograms. */
+ id: number;
hardware: string;
framework: string;
model: string;
@@ -23,9 +25,13 @@ export interface BenchmarkRow {
decode_num_workers: number;
num_prefill_gpu: number;
num_decode_gpu: number;
- isl: number;
- osl: number;
+ benchmark_type: string;
+ // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+ isl: number | null;
+ osl: number | null;
conc: number;
+ /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+ offload_mode: string;
image: string | null;
metrics: Record;
date: string;
@@ -115,10 +121,13 @@ export function fetchBenchmarks(
date?: string,
exact?: boolean,
signal?: AbortSignal,
+ /** Optional github_run_id to scope to a specific workflow run. */
+ runId?: string,
) {
const params = new URLSearchParams({ model });
if (date) params.set('date', date);
if (exact) params.set('exact', 'true');
+ if (runId) params.set('runId', runId);
return fetchJson(`/api/v1/benchmarks?${params}`, signal);
}
@@ -141,13 +150,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
export interface AvailabilityRow {
model: string;
- isl: number;
- osl: number;
+ // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
spec_method: string;
disagg: boolean;
+ benchmark_type: string;
date: string;
}
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index be76438e..fcbca681 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -6,6 +6,7 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform
function makeRow(overrides: Partial = {}): BenchmarkRow {
return {
+ id: 1,
hardware: 'h200',
framework: 'trt',
model: 'dsr1',
@@ -23,6 +24,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow {
decode_num_workers: 0,
num_prefill_gpu: 8,
num_decode_gpu: 8,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
isl: 1024,
osl: 1024,
conc: 64,
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 107f0b12..3594750c 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -15,10 +15,42 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
import { getHardwareConfig } from '@/lib/constants';
import type { BenchmarkRow } from '@/lib/api';
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here:
+ *   e2el   ≡ ttlt   (time-to-last-token == end-to-end latency)
+ *   tpot   ≡ itl    (time-per-output-token == inter-token-latency for single-output)
+ *   intvty ≡ 1/itl  (tok/s from the user's perspective)
+ * Existing fields win if present; we only fill in the gaps.
+ *
+ * All percentile prefixes that `rowToAggDataEntry` reads are bridged, including p75
+ * and p95. p75 is selectable as the chart x-axis percentile, so skipping it would
+ * leave `p75_intvty` / `p75_e2el` at their `?? 0` fallback for agentic rows.
+ *
+ * @param m Raw metrics of one agentic row.
+ * @returns Only the alias keys that had to be synthesised; keys already in `m` are never
+ *   included, and `*_intvty` is omitted when the matching itl is missing or not positive.
+ */
+function agenticAliases(m: Record<string, number>): Record<string, number> {
+  const out: Record<string, number> = {};
+  for (const suffix of ['mean', 'median', 'p75', 'p90', 'p95', 'p99', 'p99.9']) {
+    const itl = m[`${suffix}_itl`];
+    const ttlt = m[`${suffix}_ttlt`];
+    if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
+    if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl;
+    // Guard against itl <= 0 so the reciprocal never produces Infinity or a negative rate.
+    if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) {
+      out[`${suffix}_intvty`] = 1 / itl;
+    }
+  }
+  return out;
+}
+
/** Convert a DB benchmark row to an AggDataEntry. */
export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
- const m = row.metrics;
+ const isAgentic = row.benchmark_type === 'agentic_traces';
+ const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+ // Prefer the dedicated column (added in migration 004); fall back to the
+ // legacy stash inside `metrics` for any rows ingested before that column
+ // existed.
+ const rawMetrics = row.metrics as Record;
+ const offloadMode =
+ row.offload_mode ??
+ (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
return {
+ // Coerce: Postgres bigint comes through the SQL client as a string.
+ id: typeof row.id === 'number' ? row.id : Number(row.id),
hw: row.hardware,
framework: row.framework,
model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
@@ -32,23 +64,43 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
mean_ttft: m.mean_ttft ?? 0,
median_ttft: m.median_ttft ?? 0,
std_ttft: m.std_ttft ?? 0,
+ p75_ttft: m.p75_ttft ?? 0,
+ p90_ttft: m.p90_ttft ?? 0,
+ p95_ttft: m.p95_ttft ?? 0,
p99_ttft: m.p99_ttft ?? 0,
+ 'p99.9_ttft': m['p99.9_ttft'] ?? 0,
mean_tpot: m.mean_tpot ?? 0,
median_tpot: m.median_tpot ?? 0,
std_tpot: m.std_tpot ?? 0,
+ p75_tpot: m.p75_tpot ?? 0,
+ p90_tpot: m.p90_tpot ?? 0,
+ p95_tpot: m.p95_tpot ?? 0,
p99_tpot: m.p99_tpot ?? 0,
+ 'p99.9_tpot': m['p99.9_tpot'] ?? 0,
mean_intvty: m.mean_intvty ?? 0,
median_intvty: m.median_intvty ?? 0,
std_intvty: m.std_intvty ?? 0,
+ p75_intvty: m.p75_intvty ?? 0,
+ p90_intvty: m.p90_intvty ?? 0,
+ p95_intvty: m.p95_intvty ?? 0,
p99_intvty: m.p99_intvty ?? 0,
+ 'p99.9_intvty': m['p99.9_intvty'] ?? 0,
mean_itl: m.mean_itl ?? 0,
median_itl: m.median_itl ?? 0,
std_itl: m.std_itl ?? 0,
+ p75_itl: m.p75_itl ?? 0,
+ p90_itl: m.p90_itl ?? 0,
+ p95_itl: m.p95_itl ?? 0,
p99_itl: m.p99_itl ?? 0,
+ 'p99.9_itl': m['p99.9_itl'] ?? 0,
mean_e2el: m.mean_e2el ?? 0,
median_e2el: m.median_e2el ?? 0,
std_e2el: m.std_e2el ?? 0,
+ p75_e2el: m.p75_e2el ?? 0,
+ p90_e2el: m.p90_e2el ?? 0,
+ p95_e2el: m.p95_e2el ?? 0,
p99_e2el: m.p99_e2el ?? 0,
+ 'p99.9_e2el': m['p99.9_e2el'] ?? 0,
disagg: row.disagg,
num_prefill_gpu: row.num_prefill_gpu,
num_decode_gpu: row.num_decode_gpu,
@@ -68,6 +120,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
date: row.date,
actualDate: (row as any).actualDate ?? row.date,
run_url: row.run_url ?? undefined,
+ benchmark_type: row.benchmark_type,
+ isl: row.isl,
+ osl: row.osl,
+ offload_mode: offloadMode,
+ server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+ server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+ theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+ num_requests_total: m.num_requests_total,
+ num_requests_successful: m.num_requests_successful,
+ total_prompt_tokens: m.total_prompt_tokens,
+ total_generation_tokens: m.total_generation_tokens,
};
}
@@ -77,13 +140,30 @@ interface PreparedEntry {
date: string;
}
+/**
+ * Swap the leading latency-percentile prefix of a chart axis key for
+ * `percentile`, e.g. `median_e2el` → `p99_e2el`. A key that does not start
+ * with one of the known prefixes followed by `_` is returned unchanged.
+ */
+export function withPercentile(key: string, percentile: string): string {
+  const leadingPrefix = /^(mean|median|p75|p90|p95|p99|p99\.9)_/u;
+  const replacement = `${percentile}_`;
+  return key.replace(leadingPrefix, replacement);
+}
+
/**
* Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
* Returns one InferenceData[] per chart definition (e2e, interactivity).
*
* Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ * definition for the chosen percentile — only agentic rows carry the
+ * full set (median/p90/p99/p99.9) so this mainly affects that scenario.
*/
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+ rows: BenchmarkRow[],
+ percentile = 'median',
+): {
chartData: InferenceData[][];
hardwareConfig: HardwareConfig;
} {
@@ -109,13 +189,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
// Phase 2: Build chart data per chart definition (reusing prepared entries)
const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+ const xKey = withPercentile(chartDef.x, percentile);
const groupedByHw: Record = {};
for (const { entry, hwKey, date } of prepared) {
const dataPoint = createChartDataPoint(
date,
entry,
- chartDef.x as keyof AggDataEntry,
+ xKey as keyof AggDataEntry,
chartDef.y as keyof AggDataEntry,
hwKey,
);
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index f0f1ef5b..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
function makeRow(overrides: Partial): BenchmarkRow {
return {
+ id: 1,
hardware: 'h100',
framework: 'sglang',
model: 'dsr1',
@@ -30,6 +31,8 @@ function makeRow(overrides: Partial): BenchmarkRow {
metrics: { tput_per_gpu: 100 },
date: '2026-01-01',
run_url: null,
+ benchmark_type: 'single_turn',
+ offload_mode: 'off',
...overrides,
};
}
diff --git a/packages/app/src/lib/compare-pair-defaults.ts b/packages/app/src/lib/compare-pair-defaults.ts
index be6450ad..f5a37e1f 100644
--- a/packages/app/src/lib/compare-pair-defaults.ts
+++ b/packages/app/src/lib/compare-pair-defaults.ts
@@ -14,6 +14,7 @@ export function pickPairDefaults(
const seenB = new Map>();
for (const row of rows) {
if (row.hardware !== a && row.hardware !== b) continue;
+ if (row.isl === null || row.osl === null) continue;
const seq = islOslToSequence(row.isl, row.osl);
if (!seq) continue;
const key = `${seq}|${row.precision}`;
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index a3d28315..421ac69b 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -61,17 +61,33 @@ export function renderScatterPoints {
+ text
+ .append('tspan')
+ .attr('x', 0)
+ .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+ .text(line);
+ });
+ });
}
// Exit: remove stale points
@@ -128,20 +144,32 @@ export function renderScatterPoints('.point-label')
+ const lines = labelGetter(d).split('\n');
+ const text = d3
+ .select(this)
+ .selectAll('.point-label')
.data([true])
.join('text')
.attr('class', 'point-label')
- .attr('dy', -8)
.attr('text-anchor', 'middle')
.attr('fill', config.foreground!)
.attr('font-size', '10px')
- .attr('pointer-events', 'none')
- .text(config.getLabelText!(d));
+ .attr('font-weight', '700')
+ .attr('pointer-events', 'none');
+ const firstDy = -(0.8 + (lines.length - 1) * 1.1);
+ text
+ .selectAll('tspan')
+ .data(lines)
+ .join('tspan')
+ .attr('x', 0)
+ .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
+ .text((l) => l);
});
} else {
points.selectAll('.point-label').remove();
@@ -261,7 +289,21 @@ export function attachScatterTooltipHandlers<
});
}
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp to container bounds. Tall tooltips that don't fit get
+ * clamped to the container edges.
+ */
export function computeTooltipPosition(
mx: number,
my: number,
@@ -280,13 +322,21 @@ export function computeTooltipPosition(
// Force reflow so we get real dimensions
const tw = node.getBoundingClientRect().width || node.offsetWidth;
const th = node.getBoundingClientRect().height || node.offsetHeight;
+ const rect = container.getBoundingClientRect();
const cw = container.clientWidth;
const ch = container.clientHeight;
+ const EDGE_PAD = 4;
+
+ // Prefer right of cursor; flip to left if no room.
+ let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+ left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
- const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
- const top = my + offset + th > ch ? my - offset - th : my + offset;
+ // Prefer below cursor; flip above if no room.
+ let top = my + offset + th <= ch ? my + offset : my - offset - th;
+ top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
- return { left, top };
+ // Convert container-local coords → viewport coords for `position: fixed`.
+ return { left: left + rect.left, top: top + rect.top };
}
/** Update scatter point positions on zoom. */
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 6a543925..c18266ba 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -138,17 +138,73 @@ export enum Sequence {
OneK_OneK = '1k/1k',
OneK_EightK = '1k/8k',
EightK_OneK = '8k/1k',
+ AgenticTraces = 'agentic-traces',
}
-const SEQUENCE_CONFIG: Record =
- {
- [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
- [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
- [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
- };
+/**
+ * Top-level scenario kind. Fixed-seq sequences cluster under a single group
+ * in the selector; agentic traces sit alongside as their own kind.
+ */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+export function sequenceKind(seq: Sequence): ScenarioKind {
+ return seq === Sequence.AgenticTraces ? 'agentic' : 'fixed-seq';
+}
+
+const SEQUENCE_CONFIG: Record<
+ Sequence,
+ { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+ [Sequence.OneK_OneK]: {
+ label: '1K / 1K',
+ compact: '1k1k',
+ category: 'default',
+ kind: 'fixed-seq',
+ },
+ [Sequence.OneK_EightK]: {
+ label: '1K / 8K',
+ compact: '1k8k',
+ category: 'deprecated',
+ kind: 'fixed-seq',
+ },
+ [Sequence.EightK_OneK]: {
+ label: '8K / 1K',
+ compact: '8k1k',
+ category: 'default',
+ kind: 'fixed-seq',
+ },
+ [Sequence.AgenticTraces]: {
+ label: 'Agentic Traces',
+ compact: 'agentic',
+ category: 'default',
+ kind: 'agentic',
+ },
+};
export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
+/**
+ * Percentile of the latency distribution used for the chart x-axis when
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
+ */
+export enum Percentile {
+ P75 = 'p75',
+ P90 = 'p90',
+}
+
+const PERCENTILE_CONFIG: Record = {
+ [Percentile.P75]: { label: 'p75' },
+ [Percentile.P90]: { label: 'p90' },
+};
+
+export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
+
+export function getPercentileLabel(p: Percentile): string {
+ return PERCENTILE_CONFIG[p]?.label ?? p;
+}
+
export const DEPRECATED_SEQUENCES: ReadonlySet = new Set(
(Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
.filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,43 @@ function makeEntry(overrides: Partial = {}): AggDataEntry {
mean_ttft: 0.5,
median_ttft: 0.4,
std_ttft: 0.1,
+ p75_ttft: 0.65,
+ p90_ttft: 0.7,
+ p95_ttft: 0.75,
p99_ttft: 0.8,
+ 'p99.9_ttft': 0.9,
mean_tpot: 0.02,
mean_intvty: 45,
median_tpot: 0.02,
median_intvty: 44,
std_tpot: 0.005,
std_intvty: 5,
+ p75_tpot: 0.022,
+ p75_intvty: 50,
+ p90_tpot: 0.025,
+ p90_intvty: 55,
+ p95_tpot: 0.028,
+ p95_intvty: 58,
p99_tpot: 0.03,
p99_intvty: 60,
+ 'p99.9_tpot': 0.035,
+ 'p99.9_intvty': 65,
mean_itl: 0.01,
median_itl: 0.01,
std_itl: 0.002,
+ p75_itl: 0.012,
+ p90_itl: 0.013,
+ p95_itl: 0.014,
p99_itl: 0.015,
+ 'p99.9_itl': 0.018,
mean_e2el: 5,
median_e2el: 4.8,
std_e2el: 0.5,
+ p75_e2el: 5.2,
+ p90_e2el: 5.5,
+ p95_e2el: 5.8,
p99_e2el: 6,
+ 'p99.9_e2el': 6.5,
disagg: false,
num_prefill_gpu: 0,
num_decode_gpu: 0,
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index ebaa5336..73cbe0b7 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,8 +22,10 @@ const URL_STATE_KEYS = [
'i_seq',
'i_prec',
'i_metric',
+ 'i_pctl',
'i_xmetric',
'i_e2e_xmetric',
+ 'i_xmode',
'i_scale',
'i_gpus',
'i_dates',
@@ -66,8 +68,10 @@ export const PARAM_DEFAULTS: Record = {
i_seq: '8k/1k',
i_prec: 'fp4',
i_metric: 'y_tpPerGpu',
- i_xmetric: 'p99_ttft',
- i_e2e_xmetric: '',
+ i_pctl: 'p90',
+ i_xmetric: 'p90_ttft',
+ i_e2e_xmetric: 'p90_ttft',
+ i_xmode: '',
i_scale: 'auto',
i_gpus: '',
i_dates: '',
diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index cc5eb6b4..e23a93bc 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record = {
]),
),
mtp: 'MTP',
+ aiperf: 'AIPerf',
};
/**
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index cf2c4d0b..70e50f96 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,46 +1,110 @@
/**
* Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
*
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el) are in seconds; interactivity (intvty) is tokens/sec.
+ * Throughput values are tokens/sec — `_per_gpu` is per-GPU, `_tps` is total tokens/sec across
+ * the deployment.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
*/
export const METRIC_KEYS = new Set([
// throughput (tokens/sec/GPU)
'tput_per_gpu',
'output_tput_per_gpu',
'input_tput_per_gpu',
+ // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+ 'total_tput_tps',
+ 'output_tput_tps',
+ 'input_tput_tps',
// TTFT — time to first token
'median_ttft',
'mean_ttft',
+ 'p75_ttft',
'p90_ttft',
+ 'p95_ttft',
'p99_ttft',
'p99.9_ttft',
'std_ttft',
// TPOT — time per output token
'median_tpot',
'mean_tpot',
+ 'p75_tpot',
'p90_tpot',
+ 'p95_tpot',
'p99_tpot',
'p99.9_tpot',
'std_tpot',
// ITL — inter-token latency
'median_itl',
'mean_itl',
+ 'p75_itl',
'p90_itl',
+ 'p95_itl',
'p99_itl',
'p99.9_itl',
'std_itl',
// E2EL — end-to-end latency
'median_e2el',
'mean_e2el',
+ 'p75_e2el',
'p90_e2el',
+ 'p95_e2el',
'p99_e2el',
'p99.9_e2el',
'std_e2el',
// interactivity
'median_intvty',
'mean_intvty',
+ 'p75_intvty',
'p90_intvty',
+ 'p95_intvty',
'p99_intvty',
'p99.9_intvty',
'std_intvty',
+ // QPS — queries per second (agentic aiperf)
+ 'median_qps',
+ 'mean_qps',
+ 'p75_qps',
+ 'p90_qps',
+ 'p95_qps',
+ 'p99_qps',
+ 'p99.9_qps',
+ 'std_qps',
+ // per-request input token count distribution
+ 'median_input_tokens',
+ 'mean_input_tokens',
+ 'p75_input_tokens',
+ 'p90_input_tokens',
+ 'p95_input_tokens',
+ 'p99_input_tokens',
+ 'p99.9_input_tokens',
+ 'std_input_tokens',
+ // per-request output token count distribution — actual served
+ 'median_output_tokens_actual',
+ 'mean_output_tokens_actual',
+ 'p75_output_tokens_actual',
+ 'p90_output_tokens_actual',
+ 'p95_output_tokens_actual',
+ 'p99_output_tokens_actual',
+ 'p99.9_output_tokens_actual',
+ 'std_output_tokens_actual',
+ // per-request output token count distribution — expected from trace
+ 'median_output_tokens_expected',
+ 'mean_output_tokens_expected',
+ 'p75_output_tokens_expected',
+ 'p90_output_tokens_expected',
+ 'p95_output_tokens_expected',
+ 'p99_output_tokens_expected',
+ 'p99.9_output_tokens_expected',
+ 'std_output_tokens_expected',
+ // run totals (agentic aiperf)
+ 'duration_seconds',
+ 'total_requests_completed',
+ 'total_prompt_tokens',
+ 'total_generation_tokens',
+ // server prefix-cache observability (agentic aiperf)
+ 'server_gpu_cache_hit_rate',
+ 'server_cpu_cache_hit_rate',
+ 'theoretical_cache_hit_rate',
]);
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index c75034c7..783d239d 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -54,3 +54,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
};
return map[`${isl}_${osl}`] ?? null;
}
+
+/**
+ * Resolve the sequence (scenario) string for a benchmark/availability row.
+ * - `agentic_traces` rows always yield `'agentic-traces'`, whatever isl/osl hold.
+ * - Every other row (today: `single_turn`) is delegated to `islOslToSequence`.
+ * Yields `null` when the row can't be classified: a non-agentic row with a
+ * null isl or osl, or one whose isl/osl pair `islOslToSequence` does not map.
+ */
+export function rowToSequence(row: {
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+}): string | null {
+  const { isl, osl, benchmark_type: benchmarkType } = row;
+  if (benchmarkType === 'agentic_traces') {
+    return 'agentic-traces';
+  }
+  if (isl === null || osl === null) {
+    return null;
+  }
+  return islOslToSequence(isl, osl);
+}
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
new file mode 100644
index 00000000..c143914e
--- /dev/null
+++ b/packages/db/migrations/002_agentic_scenario.sql
@@ -0,0 +1,30 @@
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+ alter column isl drop not null,
+ alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+ drop constraint benchmark_results_isl_positive,
+ drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+ add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+ add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+ drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+ add constraint benchmark_results_unique unique nulls not distinct
+ (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
new file mode 100644
index 00000000..e96cbd50
--- /dev/null
+++ b/packages/db/migrations/003_agentic_availability.sql
@@ -0,0 +1,21 @@
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+ drop constraint availability_pkey;
+
+alter table availability
+ alter column isl drop not null,
+ alter column osl drop not null,
+ add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+ add constraint availability_natural_key unique nulls not distinct
+ (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
new file mode 100644
index 00000000..24b617f1
--- /dev/null
+++ b/packages/db/migrations/004_offload_mode.sql
@@ -0,0 +1,42 @@
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+ add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB (set during the earlier agentic ingest backfill). `->>` yields SQL NULL
+-- both when the key is absent and when it holds a JSON null, so filtering on
+-- IS NOT NULL skips either case: a JSON null would otherwise be written into
+-- the NOT NULL column and abort the migration. Skipped rows keep 'off'.
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and metrics->>'offload_mode' is not null;
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+ drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+ add constraint benchmark_results_unique unique nulls not distinct
+ (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+ br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+ on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
new file mode 100644
index 00000000..398bc725
--- /dev/null
+++ b/packages/db/migrations/006_agentic_trace_replay.sql
@@ -0,0 +1,34 @@
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_*` artifact:
+-- - profile_export.jsonl (~2 MB raw, per-request data)
+-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+ id bigserial primary key,
+ -- gzip(profile_export.jsonl); null when only the server metrics file existed
+ profile_export_jsonl_gz bytea,
+ profile_export_uncompressed_size bigint,
+ -- raw csv bytes; null when only the profile file existed
+ server_metrics_csv bytea,
+ server_metrics_csv_size bigint,
+ created_at timestamptz not null default now()
+);
+
+alter table benchmark_results
+ add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+ on benchmark_results (trace_replay_id)
+ where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
new file mode 100644
index 00000000..ba7bd095
--- /dev/null
+++ b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
@@ -0,0 +1,17 @@
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+ add column server_metrics_json_gz bytea,
+ add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
new file mode 100644
index 00000000..d55533b9
--- /dev/null
+++ b/packages/db/migrations/008_agentic_aggregate_stats.sql
@@ -0,0 +1,18 @@
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+ add column aggregate_stats jsonb;
diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
new file mode 100644
index 00000000..b42718b9
--- /dev/null
+++ b/packages/db/migrations/009_agentic_chart_series.sql
@@ -0,0 +1,19 @@
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+ add column chart_series jsonb;
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
new file mode 100644
index 00000000..756b775e
--- /dev/null
+++ b/packages/db/migrations/010_agentic_request_timeline.sql
@@ -0,0 +1,15 @@
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+ add column request_timeline jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index c849ea26..710089f1 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,9 @@
"db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
"db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
"db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+ "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+ "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+ "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
"db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
"db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
"db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
@@ -30,11 +33,14 @@
"@neondatabase/serverless": "^1.1.0",
"@noble/ciphers": "^2.2.0",
"@semianalysisai/inferencex-constants": "workspace:*",
- "postgres": "^3.4.9"
+ "postgres": "^3.4.9",
+ "stream-chain": "^3.4.0",
+ "stream-json": "^2.1.0"
},
"devDependencies": {
"@types/adm-zip": "^0.5.8",
"@types/node": "^25.7.0",
+ "@types/stream-json": "^1.7.8",
"@vitest/coverage-v8": "^4.1.6",
"adm-zip": "^0.5.17",
"dotenv-cli": "^11.0.0",
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..8dd42dce
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,150 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after
+ * applying migration 008 and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ * - Stream rows one at a time (server_metrics_json_gz can be hundreds of
+ * MB decompressed for TP+EP / high-conc points — keeping one in memory
+ * at a time avoids OOM).
+ * - Skip rows whose stored `aggregate_stats.version` already matches.
+ * - Recompute via the same `computeAggregateStats()` helper the ingest
+ * path uses, so behavior cannot drift.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ * [--limit N] only process the first N candidate rows (useful for
+ * smoke-tests on a fresh deploy)
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+ limit: number | null;
+ force: boolean;
+}
+
+function parseFlags(): CliFlags { // parses --force / --limit N from process.argv; other tokens are ignored here
+ let limit: number | null = null;
+ let force = false;
+ for (let i = 2; i < process.argv.length; i++) {
+ const arg = process.argv[i]!;
+ if (arg === '--force') force = true;
+ else if (arg === '--limit') {
+ const next = process.argv[++i]; // value token that follows --limit
+ if (!next || !Number.isInteger(Number(next)) || Number(next) < 1) {
+ console.error('--limit requires a positive integer argument');
+ process.exit(1);
+ }
+ limit = Number(next);
+ }
+ }
+ return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+ console.log('=== backfill-aggregate-stats ===');
+ console.log(` STATS_VERSION = ${STATS_VERSION}`);
+ console.log(` force = ${flags.force}`);
+ console.log(` limit = ${flags.limit ?? 'none'}`);
+
+ // Find candidates: rows missing stats, or whose stored version is stale.
+ // (aggregate_stats->>'version')::int is NULL when the key is absent, and
+ // `NULL <> n` is never true; coalesce to -1 so such rows count as stale.
+ const candidates = flags.force
+ ? await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `
+ : await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ where aggregate_stats is null
+ or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `;
+
+ if (candidates.length === 0) {
+ console.log('\n Nothing to do — all rows up to date.');
+ return;
+ }
+
+ console.log(`\n ${candidates.length} candidate row(s).`);
+ if (!hasYesFlag()) {
+ const ok = await confirm('\nProceed? (y/N) ');
+ if (!ok) {
+ console.log('Aborted.');
+ return;
+ }
+ }
+
+ let ok = 0;
+ let failed = 0;
+ const t0 = Date.now();
+ for (const { id } of candidates) {
+ const start = Date.now();
+ try {
+ // Fetch one row at a time — the json_gz blob is the heavy field.
+ const [row] = await sql<
+ { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[]
+ >`
+ select profile_export_jsonl_gz, server_metrics_json_gz
+ from agentic_trace_replay
+ where id = ${id}
+ `;
+ if (!row) {
+ console.warn(` id=${id}: row vanished, skipping`);
+ continue;
+ }
+
+ const stats = await computeAggregateStats({
+ profileBlob: row.profile_export_jsonl_gz,
+ serverBlob: row.server_metrics_json_gz,
+ });
+
+ await sql`
+ update agentic_trace_replay
+ set aggregate_stats = ${sql.json(structuredClone(stats) as unknown as Parameters<typeof sql.json>[0])}
+ where id = ${id}
+ `;
+ ok++;
+ const elapsed = Math.round((Date.now() - start) / 1000);
+ const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+ console.log(
+ ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+ );
+ } catch (error) {
+ failed++;
+ console.error(` ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+ }
+ }
+
+ const totalSec = Math.round((Date.now() - t0) / 1000);
+ console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+ if (failed > 0) process.exitCode = 1;
+}
+
+main()
+ .catch((error) => {
+ console.error('backfill-aggregate-stats failed:', error);
+ process.exitCode = 1;
+ })
+ .finally(() => sql.end());
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..66156b45
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,154 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ * - Stream rows one at a time (server_metrics_json_gz can decompress
+ * past 500 MB on high-conc TP+EP points — one in memory at a time
+ * avoids OOM).
+ * - Skip rows whose stored version already matches.
+ * - Recompute via the same `computeChartSeries()` helper the ingest
+ * path uses, so behavior cannot drift.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ * [--limit N] only process the first N candidate rows
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+ limit: number | null;
+ force: boolean;
+}
+
+function parseFlags(): CliFlags { // parses --force / --limit N from process.argv; other tokens are ignored here
+ let limit: number | null = null;
+ let force = false;
+ for (let i = 2; i < process.argv.length; i++) {
+ const arg = process.argv[i]!;
+ if (arg === '--force') force = true;
+ else if (arg === '--limit') {
+ const next = process.argv[++i]; // value token that follows --limit
+ if (!next || !Number.isInteger(Number(next)) || Number(next) < 1) {
+ console.error('--limit requires a positive integer argument');
+ process.exit(1);
+ }
+ limit = Number(next);
+ }
+ }
+ return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+ console.log('=== backfill-chart-series ===');
+ console.log(` CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+ console.log(` force = ${flags.force}`);
+ console.log(` limit = ${flags.limit ?? 'none'}`);
+
+ // Only rows that actually have a server_metrics blob can produce a
+ // chart_series. Rows without the blob legitimately keep `chart_series`
+ // null and the API serves them via the slow path (which also returns
+ // null because there's no blob to parse — so the page falls into the
+ // "no stored trace_replay blob" branch).
+ const candidates = flags.force
+ ? await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ where server_metrics_json_gz is not null
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `
+ : await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ where server_metrics_json_gz is not null
+ and (
+ chart_series is null
+ or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+ )
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `;
+
+ if (candidates.length === 0) {
+ console.log('\n Nothing to do — all rows up to date.');
+ return;
+ }
+
+ console.log(`\n ${candidates.length} candidate row(s).`);
+ if (!hasYesFlag()) {
+ const ok = await confirm('\nProceed? (y/N) ');
+ if (!ok) {
+ console.log('Aborted.');
+ return;
+ }
+ }
+
+ let ok = 0;
+ let failed = 0;
+ const t0 = Date.now();
+ for (const { id } of candidates) {
+ const start = Date.now();
+ try {
+ const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+ select server_metrics_json_gz
+ from agentic_trace_replay
+ where id = ${id}
+ `;
+ if (!row) {
+ console.warn(` id=${id}: row vanished, skipping`);
+ continue;
+ }
+
+ const series = await computeChartSeries(row.server_metrics_json_gz);
+
+ await sql`
+ update agentic_trace_replay
+ set chart_series = ${
+ series === null
+ ? null
+ : sql.json(structuredClone(series) as unknown as Parameters<typeof sql.json>[0])
+ }
+ where id = ${id}
+ `;
+ ok++;
+ const elapsed = Math.round((Date.now() - start) / 1000);
+ const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+ console.log(
+ ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+ );
+ } catch (error) {
+ failed++;
+ console.error(` ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+ }
+ }
+
+ const totalSec = Math.round((Date.now() - t0) / 1000);
+ console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+ if (failed > 0) process.exitCode = 1;
+}
+
+main()
+ .catch((error) => {
+ console.error('backfill-chart-series failed:', error);
+ process.exitCode = 1;
+ })
+ .finally(() => sql.end());
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..327099d0
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,144 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 010 and any time the version bumps.
+ *
+ * Usage:
+ * pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ * [--limit N] only process the first N candidate rows
+ * [--force] recompute every row, even if version already matches
+ * [--yes] skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import {
+ REQUEST_TIMELINE_VERSION,
+ computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+ limit: number | null;
+ force: boolean;
+}
+
+function parseFlags(): CliFlags { // parses --force / --limit N from process.argv; other tokens are ignored here
+ let limit: number | null = null;
+ let force = false;
+ for (let i = 2; i < process.argv.length; i++) {
+ const arg = process.argv[i]!;
+ if (arg === '--force') force = true;
+ else if (arg === '--limit') {
+ const next = process.argv[++i]; // value token that follows --limit
+ if (!next || !Number.isInteger(Number(next)) || Number(next) < 1) {
+ console.error('--limit requires a positive integer argument');
+ process.exit(1);
+ }
+ limit = Number(next);
+ }
+ }
+ return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+ noSsl: hasNoSslFlag(),
+ max: 1,
+ onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+ console.log('=== backfill-request-timeline ===');
+ console.log(` REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+ console.log(` force = ${flags.force}`);
+ console.log(` limit = ${flags.limit ?? 'none'}`);
+
+ // Only rows with a profile_export blob can produce a timeline. Rows
+ // without the blob keep `request_timeline` null and the API serves them
+ // as "no timeline data".
+ const candidates = flags.force
+ ? await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ where profile_export_jsonl_gz is not null
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `
+ : await sql<{ id: number }[]>`
+ select id
+ from agentic_trace_replay
+ where profile_export_jsonl_gz is not null
+ and (
+ request_timeline is null
+ or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+ )
+ order by id
+ ${flags.limit !== null ? sql`limit ${flags.limit}` : sql``}
+ `;
+
+ if (candidates.length === 0) {
+ console.log('\n Nothing to do — all rows up to date.');
+ return;
+ }
+
+ console.log(`\n ${candidates.length} candidate row(s).`);
+ if (!hasYesFlag()) {
+ const ok = await confirm('\nProceed? (y/N) ');
+ if (!ok) {
+ console.log('Aborted.');
+ return;
+ }
+ }
+
+ let ok = 0;
+ let failed = 0;
+ const t0 = Date.now();
+ for (const { id } of candidates) {
+ const start = Date.now();
+ try {
+ const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+ select profile_export_jsonl_gz
+ from agentic_trace_replay
+ where id = ${id}
+ `;
+ if (!row) {
+ console.warn(` id=${id}: row vanished, skipping`);
+ continue;
+ }
+ const timeline = computeRequestTimeline(row.profile_export_jsonl_gz);
+ await sql`
+ update agentic_trace_replay
+ set request_timeline = ${
+ timeline === null
+ ? null
+ : sql.json(structuredClone(timeline) as unknown as Parameters<typeof sql.json>[0])
+ }
+ where id = ${id}
+ `;
+ ok++;
+ const elapsed = Math.round((Date.now() - start) / 1000);
+ const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+ console.log(
+ ` ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+ );
+ } catch (error) {
+ failed++;
+ console.error(` ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+ }
+ }
+
+ const totalSec = Math.round((Date.now() - t0) / 1000);
+ console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+ if (failed > 0) process.exitCode = 1;
+}
+
+main()
+ .catch((error) => {
+ console.error('backfill-request-timeline failed:', error);
+ process.exitCode = 1;
+ })
+ .finally(() => sql.end());
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index 67173c64..ea802d3f 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows(
// Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
// more than once in a single batch. Deduplicate within the batch, keeping
- // the last occurrence (last metrics for each unique config/isl/osl/conc).
+ // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
const seen = new Map();
- for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+ for (const r of rows) {
+ seen.set(
+ `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+ r,
+ );
+ }
const deduped = [...seen.values()];
const configIds = deduped.map((r) => r.configId);
+ const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+ const offloadModes = deduped.map((r) => r.offloadMode);
const isls = deduped.map((r) => r.isl);
const osls = deduped.map((r) => r.osl);
const concs = deduped.map((r) => r.conc);
@@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows(
const result = await sql<{ inserted: boolean; id: number }[]>`
insert into benchmark_results (
- workflow_run_id, config_id, benchmark_type, date,
+ workflow_run_id, config_id, benchmark_type, offload_mode, date,
isl, osl, conc, image, metrics
)
select
${workflowRunId},
unnest(${sql.array(configIds)}::int[]),
- 'single_turn',
+ unnest(${sql.array(benchmarkTypes)}::text[]),
+ unnest(${sql.array(offloadModes)}::text[]),
${date}::date,
unnest(${sql.array(isls)}::int[]),
unnest(${sql.array(osls)}::int[]),
unnest(${sql.array(concs)}::int[]),
unnest(${sql.array(images)}),
unnest(${sql.array(metricsJsons)}::jsonb[])
- on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+ on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
do update set
metrics = excluded.metrics,
image = excluded.image
@@ -147,13 +155,14 @@ export async function bulkUpsertAvailability(
sql: Sql,
rows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[],
date: string,
): Promise {
@@ -162,7 +171,7 @@ export async function bulkUpsertAvailability(
const seen = new Set();
const unique: typeof rows = [];
for (const r of rows) {
- const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+ const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
if (!seen.has(key)) {
seen.add(key);
unique.push(r);
@@ -170,7 +179,7 @@ export async function bulkUpsertAvailability(
}
await sql`
- insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+ insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
select
unnest(${sql.array(unique.map((r) => r.model))}::text[]),
unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -180,6 +189,7 @@ export async function bulkUpsertAvailability(
unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+ unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
${date}::date
on conflict do nothing
`;
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 7d78e175..1aff5ea9 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([
'decode_num_workers',
'num_prefill_gpu',
'num_decode_gpu',
+ // agentic scenario
+ 'scenario_type',
+ 'users',
+ 'offload_mode',
+ 'num_requests_total',
+ 'num_requests_successful',
]);
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn` — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
/**
* METRIC_KEYS from constants is the canonical set of known metric keys.
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set();
export interface BenchmarkParams {
config: ConfigParams;
- isl: number;
- osl: number;
+ benchmarkType: BenchmarkType;
+ // Null for agentic_traces; present for single_turn.
+ isl: number | null;
+ osl: number | null;
conc: number;
+ /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+ offloadMode: string;
image: string | null;
metrics: Record;
}
@@ -114,14 +131,45 @@ export function mapBenchmarkRow(
return null;
}
- const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
- const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
- const conc = parseInt2(row.conc);
- if (!isl || !osl || !conc) {
+ // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+ // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+ const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+ const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+ const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+ const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+ // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+ const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
+ if (!conc || (!isAgentic && (!isl || !osl))) {
tracker.skips.noIslOsl++;
return null;
}
+ // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+ // every runner, including ones with 0 successful requests and null metrics.
+ // Without this skip, the empty row's nulls overwrite a good row via
+ // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+ if (
+ typeof row.num_requests_successful === 'number' &&
+ row.num_requests_successful === 0 &&
+ typeof row.num_requests_total === 'number' &&
+ row.num_requests_total > 0
+ ) {
+ tracker.skips.failedRun++;
+ return null;
+ }
+
+ // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
+ // ('none' → 'off'; any other non-empty value → 'on').
+ const offloadModeRaw =
+ typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+ ? row.offload_mode
+ : typeof row.offloading === 'string' && row.offloading.length > 0
+ ? row.offloading === 'none'
+ ? 'off'
+ : 'on'
+ : 'off';
+
const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
const isMultinode = parseBool(row.is_multinode);
const precision = normalizePrecision(String(row.precision ?? ''));
@@ -182,6 +230,12 @@ export function mapBenchmarkRow(
}
}
+ // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+ // — preserve as a stringified metric so the frontend can expose it in tooltips.
+ if (isAgentic) {
+ (metrics as Record).offload_mode = offloadModeRaw;
+ }
+
// Artifact names encode '/' as '#' to avoid path separators; restore the URI.
const image = row.image ? String(row.image).replaceAll('#', '/') : null;
@@ -205,9 +259,11 @@ export function mapBenchmarkRow(
numPrefillGpu,
numDecodeGpu,
},
+ benchmarkType,
isl,
osl,
conc,
+ offloadMode: offloadModeRaw,
image,
metrics,
};
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..de0009de
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,123 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+ const lines = requests.map((r, i) =>
+ JSON.stringify({
+ metadata: {
+ benchmark_phase: 'profiling',
+ conversation_id: `conv-${i}`,
+ turn_index: 0,
+ },
+ metrics: {
+ input_sequence_length: { value: r.isl, unit: 'tokens' },
+ output_sequence_length: { value: r.osl, unit: 'tokens' },
+ request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+ time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+ },
+ }),
+ );
+ return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1, avg: 0.2 },
+ { start_ns: 1, end_ns: 2, avg: 0.5 },
+ { start_ns: 2, end_ns: 3, avg: 0.8 },
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_hits': {
+ series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+ },
+ 'vllm:prefix_cache_queries': {
+ series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+ },
+ },
+ });
+ return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+ it('returns the current STATS_VERSION in the bundle', async () => {
+ const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+ expect(stats.version).toBe(STATS_VERSION);
+ });
+
+ it('leaves every metric null when both blobs are null', async () => {
+ const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.kvCacheUtil).toBeNull();
+ expect(stats.prefixCacheHitRate).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ });
+
+ it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+ const profileBlob = makeProfileBlob([
+ { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+ { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+ { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+ ]);
+ const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+ expect(stats.isl?.n).toBe(3);
+ expect(stats.isl?.mean).toBeCloseTo(200, 6);
+ expect(stats.osl?.n).toBe(3);
+ expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+ // Server-side metrics still null when there's no server blob.
+ expect(stats.kvCacheUtil).toBeNull();
+ expect(stats.prefixCacheHitRate).toBeNull();
+
+ // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+ expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+ // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+ // loads = [150, 275, 400], mean_load = 275
+ // scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+ // mean ≈ 1.9653
+ expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+ });
+
+ it('computes KV util + prefix hit rate from the server blob alone', async () => {
+ const stats = await computeAggregateStats({
+ profileBlob: null,
+ serverBlob: makeServerBlob(),
+ });
+ expect(stats.kvCacheUtil?.n).toBe(3);
+ expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+ expect(stats.prefixCacheHitRate?.n).toBe(1);
+ expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+ // Profile-derived metrics absent.
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ });
+
+ it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+ // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+ const garbage = Buffer.from('not-gzip-data');
+ const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+ expect(stats.isl).toBeNull();
+ expect(stats.osl).toBeNull();
+ expect(stats.normalizedSessionTimeS).toBeNull();
+ expect(stats.p90PrefillTpsPerUser).toBeNull();
+ // Version still set so the row is considered "computed".
+ expect(stats.version).toBe(STATS_VERSION);
+ });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..a422cfec
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,147 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics.js';
+import {
+ STATS_VERSION,
+ extractIslOsl,
+ extractServerMetricSamples,
+ percentilesOf,
+ type MetricPercentiles,
+} from '../queries/agentic-aggregates.js';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+ version: number;
+ isl: MetricPercentiles | null;
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+ /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+ normalizedSessionTimeS: number | null;
+ /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+ p90PrefillTpsPerUser: number | null;
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+ 'vllm:kv_cache_usage_perc',
+ 'vllm:gpu_cache_usage_perc',
+ 'vllm:prefix_cache_hits',
+ 'vllm:prefix_cache_queries',
+ 'vllm:gpu_prefix_cache_hits',
+ 'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json, keeping only the subtrees in
+ * `TARGET_METRIC_KEYS`. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: Record<string, unknown> = {};
+  const pipelineStream = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipelineStream as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: unknown };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipelineStream as any).on('end', resolve);
+    (pipelineStream as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a (profile, server-metrics)
+ * blob pair. Either blob may be null (e.g. only the server file existed) —
+ * the corresponding stats just come back null.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+    } catch {
+      // ignore malformed blob — metrics not yet assigned stay null
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      const code = error && (error as NodeJS.ErrnoException).code;
+      const msg = error instanceof Error ? error.message : String(error);
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string. Any other error leaves `server` null.
+      if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+  };
+}
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..4c6f8791
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,209 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+ prefixHits?: number;
+ prefixQueries?: number;
+ promptTokensRate?: number;
+}) {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+ { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+ { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_hits': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+ },
+ 'vllm:prefix_cache_queries': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+ },
+ 'vllm:num_requests_running': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+ },
+ 'vllm:num_requests_waiting': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+ },
+ 'vllm:prompt_tokens': {
+ series: [
+ { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+ ],
+ },
+ 'vllm:generation_tokens': {
+ series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+ },
+ 'vllm:prompt_tokens_by_source': {
+ series: [
+ {
+ labels: { source: 'local_cache_hit' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+ },
+ {
+ labels: { source: 'miss' },
+ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+ },
+ ],
+ },
+ },
+ });
+ return gzipSync(Buffer.from(json));
+}
+
+/** Build a synthetic per-engine vLLM metric series for the multi-engine test. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+ const labels = { engine: String(engineId) };
+ return {
+ runningSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: baseRunning },
+ { start_ns: 1e9, avg: baseRunning + 1 },
+ ],
+ },
+ waitingSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: 0 },
+ { start_ns: 1e9, avg: 0 },
+ ],
+ },
+ kvSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, avg: 0.25 },
+ { start_ns: 1e9, avg: 0.5 },
+ ],
+ },
+ promptSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, rate: 100 },
+ { start_ns: 1e9, rate: 200 },
+ ],
+ },
+ genSlice: {
+ labels,
+ timeslices: [
+ { start_ns: 0, rate: 50 },
+ { start_ns: 1e9, rate: 75 },
+ ],
+ },
+ };
+}
+
+describe('computeChartSeries', () => {
+ it('returns null when the blob is null', async () => {
+ expect(await computeChartSeries(null)).toBeNull();
+ });
+
+ it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.version).toBe(CHART_SERIES_VERSION);
+ });
+
+ it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.kvCacheUsage).toEqual([
+ { t: 0, value: 0.1 },
+ { t: 1, value: 0.4 },
+ { t: 2, value: 0.7 },
+ ]);
+ });
+
+ it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+ const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+ expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+ });
+
+ it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+ const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+ expect(series?.prefixCacheHitRate).toEqual([]);
+ });
+
+ it('pairs running + waiting into queueDepth points', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+ });
+
+ it('extracts prefillTps + decodeTps from counter rates', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+ expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+ });
+
+ it('splits promptTokensBySource by label and skips empty series', async () => {
+ const series = await computeChartSeries(makeBlob());
+ expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+ 'local_cache_hit',
+ 'miss',
+ ]);
+ expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+ expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+ });
+
+ it('computes timing metadata from the widest metric window', async () => {
+ const series = await computeChartSeries(makeBlob());
+ // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+ expect(series?.startNs).toBe(0);
+ expect(series?.endNs).toBe(3e9);
+ expect(series?.durationS).toBeCloseTo(3, 6);
+ expect(series?.timeslicesCount).toBe(3);
+ });
+
+ it('returns null on a malformed (non-gzip) blob', async () => {
+ const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+ expect(result).toBeNull();
+ });
+
+ it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+ // Simulate a 4-engine deployment: each engine reports its own series for
+ // every metric. Cluster-wide value should be SUM for running/waiting and
+ // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+ const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+ 'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+ 'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+ 'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+ 'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+ },
+ });
+ const blob = gzipSync(Buffer.from(json));
+ const cs = await computeChartSeries(blob);
+ expect(cs).not.toBeNull();
+ // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+ expect(cs!.queueDepth).toEqual([
+ { t: 0, running: 12, waiting: 0, total: 12 },
+ { t: 1, running: 16, waiting: 0, total: 16 },
+ ]);
+ // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+ expect(cs!.kvCacheUsage).toEqual([
+ { t: 0, value: 0.25 },
+ { t: 1, value: 0.5 },
+ ]);
+ // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800
+ expect(cs!.prefillTps).toEqual([
+ { t: 0, value: 400 },
+ { t: 1, value: 800 },
+ ]);
+ expect(cs!.decodeTps).toEqual([
+ { t: 0, value: 200 },
+ { t: 1, value: 300 },
+ ]);
+ });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..530600cf
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,290 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ */
+export const CHART_SERIES_VERSION = 2;
+
+export interface TimeSeriesPoint {
+ /** Seconds from benchmark start. */
+ t: number;
+ value: number;
+}
+
+export interface QueueDepthPoint {
+ t: number;
+ running: number;
+ waiting: number;
+ total: number;
+}
+
+export interface ChartSeries {
+  version: number;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end (0 when no slice carries `end_ns`). */
+  endNs: number;
+  /** Total benchmark window in seconds (0 unless endNs > startNs). */
+  durationS: number;
+  /** Length of the longest timeslice series seen across all metrics. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+ start_ns?: number;
+ end_ns?: number;
+ avg?: number;
+ rate?: number;
+}
+
+interface RawSeries {
+  labels?: Record<string, string>;
+  timeslices?: RawSlice[];
+}
+
+interface RawMetric {
+ series?: RawSeries[];
+}
+
+type MetricsMap = Record<string, RawMetric>;
+
+/** The set of metric subtrees the chart consumes. */
+const CHART_METRIC_KEYS = new Set([
+ 'vllm:kv_cache_usage_perc',
+ 'vllm:gpu_cache_usage_perc',
+ 'vllm:prefix_cache_hits',
+ 'vllm:prefix_cache_queries',
+ 'vllm:num_requests_running',
+ 'vllm:num_requests_waiting',
+ 'vllm:prompt_tokens',
+ 'vllm:generation_tokens',
+ 'vllm:prompt_tokens_by_source',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect only the metric
+ * subtrees listed in `CHART_METRIC_KEYS`. Avoids Node's 512 MB
+ * max-string-length cap that `gunzipSync(buffer).toString('utf8')` trips on.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: MetricsMap = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: RawMetric };
+      if (CHART_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return collected;
+}
+
+/**
+ * Parse the gzipped server_metrics blob into the metric map. Tries the
+ * synchronous fast path first; falls back to stream-parse on
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed. Other errors rethrow.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap };
+    return obj.metrics ?? {};
+  } catch (error) {
+    const code = error && (error as NodeJS.ErrnoException).code;
+    const msg = error instanceof Error ? error.message : String(error);
+    if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+      return await streamCollectMetrics(buffer);
+    }
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob.
+ * The math mirrors `getTraceServerMetrics` — this helper exists so ingest,
+ * backfill, and the API path produce byte-identical results.
+ */
+export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Malformed blob → no series (caller treats null as "no data").
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics);
+}
+
+/**
+ * Aggregate one timeslice field across all series of a metric, indexed by
+ * `start_ns`. Multi-engine vllm deployments report one series per engine —
+ * the cluster value is the sum (for running/waiting/throughput counters)
+ * or the average (for kv_cache_usage_perc, a per-engine fraction).
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of series ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue; // skip missing/NaN samples
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/** Stable order: emit one point per unique start_ns, chronologically. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
+}
+
+function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
+  // Turn the raw metric map into chart-ready series. Timing reference:
+  // smallest first-slice start_ns and largest last-slice end_ns across every
+  // series; timeslicesCount is the longest series length seen (engines are
+  // scraped on the same cadence, so the max across all series is safe).
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
+  const kvSeries =
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
+
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns. Windows with no (or zero) queries are dropped.
+  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
+  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  for (const [t, h] of sortedEntries(hitsByT)) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
+  }
+
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
+  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
+  const queueDepth: QueueDepthPoint[] = [];
+  // Union of timestamps so we surface activity even if one of the gauges
+  // didn't report a sample on a given tick.
+  const allTimes = new Set([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
+  }
+
+  // Throughput: sum the counter `rate` (already per-second) across engines.
+  const counterRate = (name: string): TimeSeriesPoint[] =>
+    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
+  const prefillTps = counterRate('vllm:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens');
+
+  // Per-source prompt tokens — sum across engines per source label.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    let byT = promptBySrcByT.get(source);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(source, byT);
+    }
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+      }
+    }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of sortedEntries(byT)) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..64512aca
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,153 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest {
+ cid: string;
+ ti: number;
+ wid?: string;
+ ad?: number;
+ phase?: string;
+ credit: number;
+ start: number;
+ end: number;
+ ack?: number | null;
+ ttftMs?: number | null;
+ isl?: number | null;
+ osl?: number | null;
+ cancelled?: boolean;
+}
+
+function makeBlob(requests: SyntheticRequest[]) {
+ const lines = requests.map((r) =>
+ JSON.stringify({
+ metadata: {
+ conversation_id: r.cid,
+ turn_index: r.ti,
+ worker_id: r.wid ?? 'worker_default',
+ agent_depth: r.ad ?? 0,
+ benchmark_phase: r.phase ?? 'profiling',
+ credit_issued_ns: r.credit,
+ request_start_ns: r.start,
+ ...(r.ack === undefined ? {} : { request_ack_ns: r.ack }),
+ request_end_ns: r.end,
+ was_cancelled: r.cancelled ?? false,
+ },
+ metrics: {
+ time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' },
+ input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' },
+ output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' },
+ },
+ }),
+ );
+ return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+describe('computeRequestTimeline', () => {
+ it('returns null when the blob is null', () => {
+ expect(computeRequestTimeline(null)).toBeNull();
+ });
+
+ it('returns null on a malformed (non-gzip) blob', () => {
+ expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+ });
+
+ it('returns null when the blob has no parseable records', () => {
+ expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull();
+ });
+
+ it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+ );
+ expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+ });
+
+ it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+ // Two requests with absolute ns starting at 1_000_000_000.
+ const tl = computeRequestTimeline(
+ makeBlob([
+ { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+ { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+ ]),
+ );
+ expect(tl?.startNs).toBe(1_000_000_000);
+ expect(tl?.endNs).toBe(1_030_000_000);
+ expect(tl?.durationS).toBeCloseTo(0.03, 6);
+ expect(tl?.requests[0]?.credit).toBe(0);
+ expect(tl?.requests[0]?.end).toBe(10_000_000);
+ expect(tl?.requests[1]?.start).toBe(21_000_000);
+ });
+
+ it('sorts requests by start time, regardless of input order', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+ { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+ { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+ ]),
+ );
+ expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]);
+ });
+
+ it('preserves conversation/worker grouping fields', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'conv-A',
+ ti: 5,
+ wid: 'worker_abcd1234',
+ ad: 2,
+ phase: 'profiling',
+ credit: 0,
+ start: 10,
+ end: 100,
+ },
+ ]),
+ );
+ const r = tl?.requests[0]!;
+ expect(r.cid).toBe('conv-A');
+ expect(r.ti).toBe(5);
+ expect(r.wid).toBe('worker_abcd1234');
+ expect(r.ad).toBe(2);
+ expect(r.phase).toBe('profiling');
+ });
+
+ it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+ const tl = computeRequestTimeline(
+ makeBlob([
+ {
+ cid: 'a',
+ ti: 0,
+ credit: 0,
+ start: 10,
+ end: 100,
+ ttftMs: 25.5,
+ isl: 1024,
+ osl: 256,
+ cancelled: true,
+ },
+ ]),
+ );
+ const r = tl?.requests[0]!;
+ expect(r.cancelled).toBe(true);
+ expect(r.ttftMs).toBeCloseTo(25.5, 6);
+ expect(r.isl).toBe(1024);
+ expect(r.osl).toBe(256);
+ });
+
+ it('skips records missing both credit_issued_ns and request_start_ns', () => {
+ // Build a record with only request_end_ns — the helper rejects it.
+ const broken = gzipSync(
+ Buffer.from(
+ JSON.stringify({
+ metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+ metrics: {},
+ }),
+ ),
+ );
+ expect(computeRequestTimeline(broken)).toBeNull();
+ });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..a1134f7a
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,182 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 1;
+
+export interface RequestRecord {
+ /** Conversation id (groups turns of one agent session); `unknown` if absent. */
+ cid: string;
+ /** Zero-based turn index within the conversation (0 when not a number). */
+ ti: number;
+ /** Worker id (concurrency slot that handled this request); `unknown` if absent. */
+ wid: string;
+ /** Sub-agent depth (0 = top-level; also 0 when not a number). */
+ ad: number;
+ /** `warmup` or `profiling`; `unknown` when the record omits it. */
+ phase: string;
+ /** ns offset from timeline.startNs. Load gen decided to dispatch (falls back to `start`). */
+ credit: number;
+ /** ns offset from timeline.startNs. HTTP send started (falls back to `credit`). */
+ start: number;
+ /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+ ack: number | null;
+ /** ns offset from timeline.startNs. Last byte received. */
+ end: number;
+ /** Time-to-first-token in ms; null when missing or non-finite. */
+ ttftMs: number | null;
+ /** Input sequence length (tokens); null when missing or non-finite. */
+ isl: number | null;
+ /** Output sequence length (tokens); null when missing or non-finite. */
+ osl: number | null;
+ cancelled: boolean;
+}
+
+export interface RequestTimeline {
+ version: number;
+ /** Wall-clock ns of the earliest credit (or request start); the relative-time origin. */
+ startNs: number;
+ /** Wall-clock ns of the latest `request_end_ns`. */
+ endNs: number;
+ /** Total span in seconds (0 unless endNs > startNs). */
+ durationS: number;
+ requests: RequestRecord[];
+}
+
+interface RawMetadata {
+ conversation_id?: string;
+ turn_index?: number;
+ worker_id?: string;
+ agent_depth?: number;
+ benchmark_phase?: string;
+ credit_issued_ns?: number;
+ request_start_ns?: number;
+ request_ack_ns?: number;
+ request_end_ns?: number;
+ was_cancelled?: boolean;
+}
+
+interface RawMetricValue {
+ value?: number;
+}
+
+interface RawRecord {
+ metadata?: RawMetadata;
+ metrics?: {
+ time_to_first_token?: RawMetricValue | number;
+ input_sequence_length?: RawMetricValue | number;
+ output_sequence_length?: RawMetricValue | number;
+ };
+}
+
+/** Unwrap a finite number from a bare value or a `{value, unit}` metric envelope. */
+function readNum(v: unknown): number | undefined {
+  const finiteOrUndefined = (x: unknown): number | undefined =>
+    typeof x === 'number' && Number.isFinite(x) ? x : undefined;
+  if (typeof v === 'number') return finiteOrUndefined(v);
+  // Anything that is not a non-null object carrying a `value` key has no number.
+  if (v === null || typeof v !== 'object' || !('value' in v)) return undefined;
+  return finiteOrUndefined((v as { value?: unknown }).value);
+}
+
+/**
+ * Parse the gzipped `profile_export.jsonl` blob into a chart-ready timeline.
+ * Returns null on a missing/malformed blob or when no record is usable.
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let text: string;
+  try {
+    text = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // First pass: parse + collect raw turns; find timeline origin.
+  const raw: {
+    meta: RawMetadata;
+    ttftMs: number | null;
+    isl: number | null;
+    osl: number | null;
+  }[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of text.split('\n')) {
+    if (!line) continue;
+    let rec: RawRecord | null;
+    try {
+      rec = JSON.parse(line) as RawRecord | null;
+    } catch {
+      continue;
+    }
+    const meta = rec?.metadata ?? {}; // a literal `null` line parses but has no fields
+    // Use credit_issued_ns when available (the true start of the request's
+    // lifecycle), falling back to request_start_ns. Skip rows missing both.
+    const cStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const cEnd = meta.request_end_ns;
+    if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue;
+
+    if (cStart < originNs) originNs = cStart;
+    if (cEnd > endNs) endNs = cEnd;
+
+    raw.push({
+      meta,
+      ttftMs: readNum(rec?.metrics?.time_to_first_token) ?? null,
+      isl: readNum(rec?.metrics?.input_sequence_length) ?? null,
+      osl: readNum(rec?.metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (raw.length === 0) return null;
+  if (!Number.isFinite(originNs)) originNs = 0;
+
+  // Second pass: shift timestamps to be relative to originNs (smaller
+  // numbers fit in JSON nicely and the frontend doesn't need bigint math).
+  const requests: RequestRecord[] = [];
+  for (const r of raw) {
+    const m = r.meta;
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    requests.push({
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs: r.ttftMs,
+      isl: r.isl,
+      osl: r.osl,
+      cancelled: m.was_cancelled === true,
+    });
+  }
+
+  // Stable order so backfill output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs,
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index d42429c9..0e1166aa 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -22,6 +22,8 @@ export { GPU_KEYS };
* stripped base is not in `GPU_KEYS`.
*/
export function hwToGpuKey(hw: string): string | null {
+ // Lower-case, then keep only the segment before the first `-` (`h200-cw` → `h200`).
+ // Subsumes all the prior explicit suffix strips (-nv, -amds, -dgxc-slurm, -p1, -cw, …).
const base = hw.toLowerCase().split('-')[0];
return GPU_KEYS.has(base) ? base : null;
}
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
expect(tracker.skips.unmappedHw).toBe(0);
expect(tracker.skips.noIslOsl).toBe(0);
expect(tracker.skips.dbError).toBe(0);
+ expect(tracker.skips.traceReplayMissing).toBe(0);
});
it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 6166ea44..401d197c 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,7 +8,10 @@ export interface Skips {
unmappedModel: number;
unmappedHw: number;
noIslOsl: number;
+ failedRun: number;
dbError: number;
+ /** Agentic point whose sibling `agentic_<suffix>` artifact had no trace_replay files. */
+ traceReplayMissing: number;
}
export interface SkipSnapshot {
@@ -66,7 +69,15 @@ const MAX_DB_ERRORS = 10;
* @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
*/
export function createSkipTracker(): SkipTracker {
- const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+ const skips: Skips = {
+ badZip: 0,
+ unmappedModel: 0,
+ unmappedHw: 0,
+ noIslOsl: 0,
+ failedRun: 0,
+ dbError: 0,
+ traceReplayMissing: 0,
+ };
const unmappedModels = new Set();
const unmappedHws = new Set();
const unmappedPrecisions = new Set();
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..8cc03f2a
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,103 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl` +
+ * `server_metrics_export.csv`) into `agentic_trace_replay` and link the new row
+ * to each provided benchmark_results row via `trace_replay_id`.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
+
+type Sql = ReturnType<typeof postgres>; // client instance type produced by calling `postgres(...)`
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ * No-op when no ids are given, all blobs are null, or every row is already linked.
+ *
+ * @param sql                 Active `postgres` connection.
+ * @param benchmarkResultIds  DB ids of the benchmark_results rows produced by the
+ *                            same `bmk_agentic_<suffix>` artifact whose sibling
+ *                            `agentic_<suffix>` directory holds these trace files.
+ * @param profileExportJsonl  Raw bytes of `profile_export.jsonl`, or null.
+ *                            Gzipped before storage.
+ * @param serverMetricsCsv    Raw bytes of `server_metrics_export.csv`, or null.
+ *                            Stored as-is.
+ * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
+ *                            per-scrape time-series of every Prometheus metric.
+ *                            Optional, gzipped before storage (~42x ratio).
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+): Promise<void> {
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Only link rows that don't already point at a trace_replay row — keeps
+  // re-ingest from inserting duplicate sibling blobs.
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+      and trace_replay_id is null
+  `;
+  if (unlinked.length === 0) return;
+
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+
+  // Pre-compute aggregate stats, chart-ready time-series and the per-request
+  // timeline once at ingest so readers need not re-parse the blobs. The helpers
+  // are handed null when the corresponding blob is absent.
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz),
+    computeRequestTimeline(profileGz),
+  ]);
+
+  // Insert the blob row and link it in ONE statement (data-modifying CTE): a failure
+  // can no longer leave an orphaned, unlinked blob that a re-ingest would duplicate.
+  await sql`
+    with inserted as (
+      insert into agentic_trace_replay (
+        profile_export_jsonl_gz,
+        profile_export_uncompressed_size,
+        server_metrics_csv,
+        server_metrics_csv_size,
+        server_metrics_json_gz,
+        server_metrics_json_uncompressed_size,
+        aggregate_stats,
+        chart_series,
+        request_timeline
+      )
+      values (
+        ${profileGz},
+        ${profileSize},
+        ${serverMetricsCsv},
+        ${csvSize},
+        ${metricsJsonGz},
+        ${metricsJsonSize},
+        ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
+        ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])},
+        ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters<typeof sql.json>[0])}
+      )
+      returning id
+    )
+    update benchmark_results
+    set trace_replay_id = (select id from inserted)
+    where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+  `;
+}
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index c345e662..eeb55313 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -45,6 +45,7 @@ import {
bulkUpsertAvailability,
insertServerLog,
} from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
import { ingestEvalRow } from './etl/eval-ingest';
import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -109,15 +110,30 @@ if (isDownloadMode) {
} catch {}
}
- const byName = new Map();
+ // Strip the trailing `_<runner>_<index>` token from each
+ // artifact name, then group by the resulting logical name and keep only
+ // the most recent per group. Without this, two artifacts produced on
+ // different runners for the same logical config (e.g. `…_h200-cw_00` and
+ // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty
+ // metrics can overwrite the good one via ON CONFLICT DO UPDATE.
+ //
+ // The runner pool name itself has no underscores (`h200-cw`,
+ // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip
+ // bounded — using `\w` here would over-match across earlier `_`
+ // separators and collapse different (conc, offload) variants into the
+ // same logical name.
+ const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/;
+ const byLogical = new Map();
for (const a of allArtifacts) {
- const existing = byName.get(a.name);
+ const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+ const existing = byLogical.get(key);
if (!existing || a.created_at > existing.created_at) {
- byName.set(a.name, a);
+ byLogical.set(key, a);
}
}
- for (const [name, artifact] of byName) {
+ for (const [, artifact] of byLogical) {
+ const name = artifact.name;
console.log(` ${name}`);
const zipPath = path.join(artifactsDir, 'artifact.zip');
execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
@@ -129,7 +145,7 @@ if (isDownloadMode) {
fs.unlinkSync(zipPath);
}
- console.log(`\n Downloaded ${byName.size} artifact(s)`);
+ console.log(`\n Downloaded ${byLogical.size} artifact(s)`);
// Fetch run attempt from API
const attemptStr = execSync(
@@ -194,6 +210,14 @@ const ARTIFACT_NAMES = {
changelog: 'changelog-metadata',
} as const;
+/**
+ * Join key shared by `bmk_agentic_<suffix>` and its sibling `agentic_<suffix>`:
+ * drop a leading `bmk_`, then a leading `agentic_`, and return what remains.
+ */
+const stripBmkAndAgenticPrefix = (s: string): string => {
+  const withoutBmk = s.replace(/^bmk_/u, '');
+  return withoutBmk.replace(/^agentic_/u, '');
+};
+
function readJson(filePath: string): unknown {
try {
return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -294,13 +318,14 @@ async function main(): Promise {
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
let totalNewBmk = 0,
@@ -311,6 +336,7 @@ async function main(): Promise {
let totalSamples = 0;
let totalSampleFiles = 0;
let totalChangelogs = 0;
+ let totalTraceReplayLinked = 0;
// ── Check for evals-only flag in changelog ────────────────────────────
const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -365,6 +391,56 @@ async function main(): Promise {
console.log(` Found ${serverLogPaths.size} server log artifact(s)`);
}
+ // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+ // `agentic_<suffix>` dir holding `profile_export.jsonl` and
+ // `server_metrics_export.{csv,json}`. The harness emits these under either a
+ // `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
+ // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
+ // suffix so both names map to the same Map entry.
+ const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+ const traceReplayPaths = new Map<
+ string,
+ {
+ profileJsonl: string | null;
+ serverMetricsCsv: string | null;
+ serverMetricsJson: string | null;
+ }
+ >();
+ if (fs.existsSync(artifactsDir)) {
+ for (const d of fs.readdirSync(artifactsDir)) {
+ if (!d.startsWith('agentic_')) continue;
+ let profile: string | null = null;
+ let metrics: string | null = null;
+ let metricsJson: string | null = null;
+ for (const sub of TRACE_SUBDIRS) {
+ const dir = path.join(artifactsDir, d, sub);
+ if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;
+ if (!profile) {
+ const p = path.join(dir, 'profile_export.jsonl');
+ if (fs.existsSync(p)) profile = p;
+ }
+ if (!metrics) {
+ const m = path.join(dir, 'server_metrics_export.csv');
+ if (fs.existsSync(m)) metrics = m;
+ }
+ if (!metricsJson) {
+ const j = path.join(dir, 'server_metrics_export.json');
+ if (fs.existsSync(j)) metricsJson = j;
+ }
+ }
+ if (!profile && !metrics && !metricsJson) continue;
+ const suffix = stripBmkAndAgenticPrefix(d);
+ traceReplayPaths.set(suffix, {
+ profileJsonl: profile,
+ serverMetricsCsv: metrics,
+ serverMetricsJson: metricsJson,
+ });
+ }
+ }
+ if (traceReplayPaths.size > 0) {
+ console.log(` Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+ }
+
const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
console.log(` Found ${allBmkFiles.length} benchmark JSON file(s)`);
@@ -415,6 +491,7 @@ async function main(): Promise {
framework: r.config.framework,
specMethod: r.config.specMethod,
disagg: r.config.disagg,
+ benchmarkType: r.benchmarkType,
});
}
@@ -431,12 +508,42 @@ async function main(): Promise {
}
}
}
+
+ // Trace-replay sibling lookup for agentic points only. The aiperf harness
+ // emits `agentic_<suffix>/{aiperf_artifacts,trace_replay}/...` next to the
+ // `bmk_agentic_<suffix>` artifact we just ingested.
+ if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+ const suffix = stripBmkAndAgenticPrefix(parentDir);
+ const trace = traceReplayPaths.get(suffix);
+ if (trace) {
+ try {
+ const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+ const metrics = trace.serverMetricsCsv
+ ? fs.readFileSync(trace.serverMetricsCsv)
+ : null;
+ const metricsJson = trace.serverMetricsJson
+ ? fs.readFileSync(trace.serverMetricsJson)
+ : null;
+ await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson);
+ totalTraceReplayLinked += insertedIds.length;
+ } catch (error: any) {
+ tracker.recordDbError(`trace_replay for ${suffix}`, error);
+ }
+ } else {
+ tracker.skips.traceReplayMissing++;
+ }
+ }
} catch (error: any) {
tracker.recordDbError(path.basename(file), error);
}
}
}
console.log(` Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+ if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+ console.log(
+ ` Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+ );
+ }
if (availRows.length > 0) {
try {
@@ -654,11 +761,17 @@ async function main(): Promise {
const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
const totalSkips =
- skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+ skips.badZip +
+ skips.unmappedModel +
+ skips.unmappedHw +
+ skips.noIslOsl +
+ skips.failedRun +
+ skips.dbError;
if (totalSkips > 0) {
console.log(`\n Skipped: ${totalSkips} rows`);
const skipLines: [string, number][] = [
['no isl/osl (old format)', skips.noIslOsl],
+ ['failed run (0 successful)', skips.failedRun],
['unmapped model', skips.unmappedModel],
['unmapped hw', skips.unmappedHw],
['bad/empty zip', skips.badZip],
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 9c17bfaf..b4a6fb95 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -457,6 +457,9 @@ async function mapWorkflowDir(
unmappedModel: local.skips.unmappedModel,
unmappedHw: local.skips.unmappedHw,
noIslOsl: local.skips.noIslOsl,
+ failedRun: local.skips.failedRun,
+ // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+ traceReplayMissing: local.skips.traceReplayMissing,
},
localUnmappedModels: new Set(local.unmappedModels),
localUnmappedHws: new Set(local.unmappedHws),
@@ -621,13 +624,14 @@ async function main(): Promise {
// Upsert availability rows only for successfully resolved configs
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
for (const r of allInserted) {
availRows.push({
@@ -639,6 +643,7 @@ async function main(): Promise {
framework: r.config.framework,
specMethod: r.config.specMethod,
disagg: r.config.disagg,
+ benchmarkType: r.benchmarkType,
});
}
if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index a3b62fe0..f868767e 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
const rows: {
configId: number;
- isl: number;
- osl: number;
+ benchmarkType: 'single_turn' | 'agentic_traces';
+ offloadMode: string;
+ isl: number | null;
+ osl: number | null;
conc: number;
image: string | null;
metrics: Record;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
rows.push({
configId,
+ benchmarkType: 'single_turn',
+ offloadMode: 'off',
isl: entry.isl,
osl: entry.osl,
conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
// to `rows` are exactly the valid ones.
const availRows: {
model: string;
- isl: number;
- osl: number;
+ isl: number | null;
+ osl: number | null;
precision: string;
hardware: string;
framework: string;
specMethod: string;
disagg: boolean;
+ benchmarkType: string;
}[] = [];
for (const entry of entries) {
const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
framework,
specMethod,
disagg,
+ benchmarkType: 'single_turn',
});
}
if (availRows.length > 0) {
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 25525e04..785d82c4 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -273,6 +273,7 @@ function toBenchmarkRow(
metrics?: Record,
): BenchmarkRow {
return {
+ id: br.id,
hardware: c.hardware,
framework: c.framework,
model: c.model,
@@ -290,6 +291,8 @@ function toBenchmarkRow(
decode_num_workers: c.decode_num_workers,
num_prefill_gpu: c.num_prefill_gpu,
num_decode_gpu: c.num_decode_gpu,
+ benchmark_type: br.benchmark_type ?? 'single_turn',
+ offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off',
isl: br.isl,
osl: br.osl,
conc: br.conc,
@@ -410,7 +413,11 @@ export function getAvailabilityData(): AvailabilityRow[] {
for (const a of s.availability) {
const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`;
if (validKeys.has(key)) {
- rows.push({ ...a, date: toDateString(a.date) });
+ rows.push({
+ ...a,
+ benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn',
+ date: toDateString(a.date),
+ });
}
}
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
new file mode 100644
index 00000000..8c712323
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest';
+
+import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates.js';
+
+describe('percentilesOf', () => {
+ it('returns null for empty input', () => {
+ expect(percentilesOf([])).toBeNull();
+ expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull();
+ });
+
+ it('computes percentiles for a simple integer range', () => {
+ // 1..100, evenly spaced — linear quantile is straightforward.
+ const xs = Array.from({ length: 100 }, (_, i) => i + 1);
+ const p = percentilesOf(xs);
+ expect(p).not.toBeNull();
+ expect(p!.n).toBe(100);
+ expect(p!.mean).toBeCloseTo(50.5, 6);
+ expect(p!.p50).toBeCloseTo(50.5, 6);
+ // For 100 sorted values, p75 = sorted[0.75 * 99] = sorted[74.25] interp.
+ expect(p!.p75).toBeCloseTo(75.25, 6);
+ expect(p!.p90).toBeCloseTo(90.1, 6);
+ expect(p!.p99).toBeCloseTo(99.01, 6);
+ });
+
+ it('filters out non-finite values before computing', () => {
+ const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]);
+ expect(p?.n).toBe(4);
+ expect(p?.mean).toBeCloseTo(2.5, 6);
+ });
+});
+
+describe('extractIslOsl', () => {
+ it('reads input/output sequence length from profiling records', () => {
+ const lines = [
+ JSON.stringify({
+ metadata: { benchmark_phase: 'profiling' },
+ metrics: {
+ input_sequence_length: { value: 100, unit: 'tokens' },
+ output_sequence_length: { value: 50, unit: 'tokens' },
+ },
+ }),
+ JSON.stringify({
+ metadata: { benchmark_phase: 'profiling' },
+ metrics: {
+ input_sequence_length: { value: 200, unit: 'tokens' },
+ output_sequence_length: { value: 75, unit: 'tokens' },
+ },
+ }),
+ // warmup record — should be ignored
+ JSON.stringify({
+ metadata: { benchmark_phase: 'warmup' },
+ metrics: {
+ input_sequence_length: { value: 9999, unit: 'tokens' },
+ output_sequence_length: { value: 9999, unit: 'tokens' },
+ },
+ }),
+ ];
+ const { isl, osl } = extractIslOsl(lines.join('\n'));
+ expect(isl).toEqual([100, 200]);
+ expect(osl).toEqual([50, 75]);
+ });
+});
+
+describe('extractServerMetricSamples', () => {
+ it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => {
+ const json = JSON.stringify({
+ metrics: {
+ 'vllm:kv_cache_usage_perc': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, end_ns: 1, avg: 0.1 },
+ { start_ns: 1, end_ns: 2, avg: 0.5 },
+ { start_ns: 2, end_ns: 3, avg: 0.9 },
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_hits': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, rate: 80 },
+ { start_ns: 1, rate: 50 },
+ { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0
+ ],
+ },
+ ],
+ },
+ 'vllm:prefix_cache_queries': {
+ series: [
+ {
+ timeslices: [
+ { start_ns: 0, rate: 100 }, // hit rate = 0.8
+ { start_ns: 1, rate: 100 }, // hit rate = 0.5
+ { start_ns: 2, rate: 0 },
+ ],
+ },
+ ],
+ },
+ },
+ });
+ const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+ expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]);
+ expect(prefixCacheHitRate).toEqual([0.8, 0.5]);
+ });
+
+ it('returns empty arrays when the JSON lacks the expected metric series', () => {
+ const out = extractServerMetricSamples(JSON.stringify({ metrics: {} }));
+ expect(out.kvCacheUtil).toEqual([]);
+ expect(out.prefixCacheHitRate).toEqual([]);
+ });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
new file mode 100644
index 00000000..1ad7fd7f
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -0,0 +1,421 @@
+/**
+ * Per-id aggregate stats for the "Aggregates across configs" view on the
+ * agentic detail page. Each id contributes one summary number per metric per
+ * percentile so the frontend can plot how each metric varies across the
+ * SKU's parallelism + concurrency configs.
+ *
+ * Sources:
+ * - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase)
+ * - `server_metrics_json` → time-series of KV cache utilization +
+ * prefix-cache hit rate per scrape interval
+ *
+ * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing
+ * or has no usable samples — frontend treats those as "no data".
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import type { DbClient } from '../connection.js';
+
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` is older.
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
+ * import: the compute helper depends on the percentile utilities below.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ */
+export const STATS_VERSION = 2;
+
+export interface MetricPercentiles {
+ mean: number;
+ p50: number;
+ p75: number;
+ p90: number;
+ p99: number;
+ /** Sample count used to compute the percentiles. */
+ n: number;
+}
+
+export interface AgenticAggregate {
+ id: number; // benchmark_results.id this summary belongs to
+ isl: MetricPercentiles | null; // null = blob missing or no usable samples
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>; // keyed by benchmark_results.id
+
+/**
+ * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per
+ * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed
+ * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire
+ * size, so two of those = ~50 MB and three already trips the 64 MB cap).
+ * We fetch the two blob types in separate queries with different chunk
+ * sizes.
+ */
+const PROFILE_CHUNK_SIZE = 8;
+const SERVER_CHUNK_SIZE = 1;
+
+/** Percentile by linear interpolation between closest ranks (numpy's default). */
+function quantile(sortedAsc: number[], q: number): number {
+  const last = sortedAsc.length - 1;
+  if (last < 0) return Number.NaN;
+  if (last === 0) return sortedAsc[0]!;
+  const rank = last * q;
+  const below = Math.floor(rank);
+  const base = sortedAsc[below]!;
+  return rank === below ? base : base + (sortedAsc[Math.ceil(rank)]! - base) * (rank - below);
+}
+
+/** Arithmetic mean, summed left to right; an empty array yields NaN (0 / 0). */
+function meanOf(xs: number[]): number {
+  const total = xs.reduce((acc, x) => acc + x, 0);
+  return total / xs.length;
+}
+
+/** Mean + p50/p75/p90/p99 over the finite samples only; null when none are finite. */
+export function percentilesOf(samples: number[]): MetricPercentiles | null {
+  const ascending = samples.filter((v) => Number.isFinite(v)).sort((a, b) => a - b);
+  if (ascending.length === 0) return null;
+  const at = (q: number): number => quantile(ascending, q);
+  return {
+    mean: meanOf(ascending),
+    p50: at(0.5),
+    p75: at(0.75),
+    p90: at(0.9),
+    p99: at(0.99),
+    n: ascending.length,
+  };
+}
+
+/**
+ * Unwrap a metric given as a bare number or a `{ value, unit }` envelope.
+ * Non-finite values (NaN, ±Infinity) are rejected for both shapes → undefined.
+ */
+function readNum(v: unknown): number | undefined {
+  const candidate =
+    typeof v === 'object' && v !== null && 'value' in v ? (v as { value?: unknown }).value : v;
+  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : undefined;
+}
+
+interface ProfileRecord {
+ metadata?: { benchmark_phase?: string }; // phase tag; extractIslOsl skips tagged non-'profiling' records
+ metrics?: {
+ input_sequence_length?: { value?: number } | number; // envelope or bare number
+ output_sequence_length?: { value?: number } | number; // envelope or bare number
+ };
+}
+
+/** Parse profile_export.jsonl text → per-request ISL + OSL arrays (profiling phase only). */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const out: { isl: number[]; osl: number[] } = { isl: [], osl: [] };
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord | null;
+    try {
+      rec = JSON.parse(line) as ProfileRecord | null;
+    } catch {
+      continue; // malformed line — skip it rather than abort the whole blob
+    }
+    if (rec === null || typeof rec !== 'object') continue; // e.g. a literal `null` line
+    const phase = rec.metadata?.benchmark_phase;
+    if (phase && phase !== 'profiling') continue; // untagged records are kept
+    const i = readNum(rec.metrics?.input_sequence_length);
+    const o = readNum(rec.metrics?.output_sequence_length);
+    if (typeof i === 'number') out.isl.push(i);
+    if (typeof o === 'number') out.osl.push(o);
+  }
+  return out;
+}
+
+interface TimeSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+  count?: number;
+  sum?: number;
+}
+interface Series {
+  labels?: Record<string, string>; // label value type presumed string — confirm against exporter
+  timeslices?: TimeSlice[];
+}
+interface MetricMeta {
+  series?: Series[];
+}
+interface MetricsJson {
+  metrics?: Record<string, MetricMeta>; // keyed by metric name, e.g. 'vllm:kv_cache_usage_perc'
+}
+
+/**
+ * Combine one numeric timeslice field across every series of a metric, keyed
+ * by each timeslice's `start_ns`.
+ *
+ * @param metricSeries All series of one metric (vllm reports one series per
+ *   engine on multi-engine DP/PP deployments); `undefined` yields an empty map.
+ * @param field `avg` for gauges, `rate` for counter deltas. Timeslices whose
+ *   `start_ns` or selected field is missing / non-finite are ignored.
+ * @param combine `sum` → total across series at each `start_ns`; `avg` → mean
+ *   over the series that reported a usable value there (e.g. KV cache util).
+ * @returns Map from `start_ns` to the combined value, in first-seen order.
+ */
+function aggregateSeriesByStart(
+  metricSeries: readonly Series[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>();
+  const counts = new Map<number, number>();
+  for (const s of metricSeries ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue;
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1));
+  return out;
+}
+
+/**
+ * Derive KV-cache-utilization and prefix-cache-hit-rate samples from the text
+ * of a server_metrics JSON export.
+ *
+ * All series of each metric are folded together through
+ * `aggregateSeriesByStart`, one value per `start_ns`, so a metric reporting
+ * several series contributes all of them rather than only the first. Each
+ * metric is looked up under its primary name, then under its `vllm:gpu_*` name.
+ *
+ * @returns `kvCacheUtil`: one averaged gauge value per timeslice;
+ *   `prefixCacheHitRate`: hits / queries per timeslice with a positive query rate.
+ */
+export function extractServerMetricSamples(json: string): {
+  kvCacheUtil: number[];
+  prefixCacheHitRate: number[];
+} {
+  const metrics = (JSON.parse(json) as MetricsJson).metrics ?? {};
+  // The alternate name is consulted only when the primary entry has no `series`.
+  const seriesOf = (primary: string, alternate: string): Series[] | undefined =>
+    metrics[primary]?.series ?? metrics[alternate]?.series;
+
+  // KV cache util: averaged (not summed) across series so it keeps the gauge's scale.
+  const kvSeries = seriesOf('vllm:kv_cache_usage_perc', 'vllm:gpu_cache_usage_perc');
+  const kvCacheUtil = [...aggregateSeriesByStart(kvSeries, 'avg', 'avg').values()];
+
+  // Hit rate per timeslice = summed hits.rate / summed queries.rate; a timeslice
+  // with no query entry, or a non-positive query rate, contributes no sample.
+  const hitSeries = seriesOf('vllm:prefix_cache_hits', 'vllm:gpu_prefix_cache_hits');
+  const querySeries = seriesOf('vllm:prefix_cache_queries', 'vllm:gpu_prefix_cache_queries');
+  const hitsByStart = aggregateSeriesByStart(hitSeries, 'rate', 'sum');
+  const queriesByStart = aggregateSeriesByStart(querySeries, 'rate', 'sum');
+  const prefixCacheHitRate = [...hitsByStart].flatMap(([startNs, hits]) => {
+    const queries = queriesByStart.get(startNs);
+    return queries !== undefined && queries > 0 ? [hits / queries] : [];
+  });
+
+  return { kvCacheUtil, prefixCacheHitRate };
+}
+
+/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
+const TARGET_METRIC_KEYS = new Set([
+ 'vllm:kv_cache_usage_perc',
+ 'vllm:gpu_cache_usage_perc', // older fallback name
+ 'vllm:prefix_cache_hits',
+ 'vllm:prefix_cache_queries',
+ 'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+ 'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics
+ * we need. Avoids the Node 512 MB string cap that JSON.parse hits on
+ * server_metrics blobs from high-conc TP+EP runs (which can decompress to
+ * >500 MB because vllm dumps `cache_config_info` every scrape interval).
+ *
+ * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') →
+ * StreamObject (one metric per chunk) → keep only the keys we care about.
+ *
+ * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the
+ * synchronous fast path so callers can use either interchangeably.
+ */
+async function streamExtractServerMetricSamples(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected: Record<string, MetricMeta> = {};
+  // stream-json composes transforms via stream-chain: `pick`/`streamObject`
+  // each return a Transform when called and `chain([...])` wires them up.
+  // Its TypeScript types don't compose cleanly with node:stream, and the
+  // event APIs are typed loosely — hence the local `any` casts below. It
+  // works at runtime.
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: MetricMeta };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+export async function getAgenticAggregates(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<AgenticAggregateMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const result: AgenticAggregateMap = {};
+
+  // Fast path: read the pre-computed `aggregate_stats` JSONB written by the
+  // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One
+  // round-trip pulls everything we need for every requested id with no blob
+  // decompression, so the slow blob-parsing fallback only runs for ids
+  // whose stats are missing or were produced by an older `STATS_VERSION`.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: AggregateStatsRow | null;
+  }[];
+
+  const idsNeedingProfile: number[] = [];
+  const idsNeedingServer: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    const agg = blankAggregate(id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      agg.isl = row.stats.isl ?? null;
+      agg.osl = row.stats.osl ?? null;
+      agg.kvCacheUtil = row.stats.kvCacheUtil ?? null;
+      agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null;
+    } else {
+      // No stats (or stale version) — schedule the blob-parse fallback below
+      // so the response still surfaces data. Backfill should drain these.
+      idsNeedingProfile.push(id);
+      idsNeedingServer.push(id);
+    }
+    result[id] = agg;
+  }
+  // Ids that didn't return a row at all (no trace_replay link) get a blank
+  // entry — keep the caller contract: every id we know about lands in the map.
+  for (const id of benchmarkResultIds) {
+    if (!(id in result)) result[id] = blankAggregate(id);
+  }
+
+  if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) {
+    return result;
+  }
+
+  // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
+  for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (row.profile_blob) {
+        try {
+          const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+          const { isl, osl } = extractIslOsl(jsonl);
+          result[id].isl = percentilesOf(isl);
+          result[id].osl = percentilesOf(osl);
+        } catch {
+          // ignore malformed blob
+        }
+      }
+    }
+  }
+  // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ───────
+  // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
+  // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
+  // path runs at most once per sibling set.
+  for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.server_metrics_json_gz as server_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; server_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (!row.server_blob) continue;
+      let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+      try {
+        const json = gunzipSync(row.server_blob).toString('utf8');
+        parsed = extractServerMetricSamples(json);
+      } catch (error) {
+        // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose
+        // server_metrics_json decompresses past Node's max string length.
+        // Stream-parse to extract just the metric subtrees we care about.
+        const code = error && (error as NodeJS.ErrnoException).code;
+        const msg = error instanceof Error ? error.message : String(error);
+        if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+          try {
+            parsed = await streamExtractServerMetricSamples(row.server_blob);
+          } catch {
+            // stream fallback failed too — leave nulls
+          }
+        }
+      }
+      if (parsed) {
+        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
+        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+      }
+    }
+  }
+  return result;
+}
+
+/** Shape of the JSONB column when read back via postgres-js. */
+interface AggregateStatsRow {
+ version: number;
+ isl: MetricPercentiles | null;
+ osl: MetricPercentiles | null;
+ kvCacheUtil: MetricPercentiles | null;
+ prefixCacheHitRate: MetricPercentiles | null;
+ normalizedSessionTimeS: number | null;
+ p90PrefillTpsPerUser: number | null;
+}
+
+function blankAggregate(id: number): AgenticAggregate {
+ return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
+}
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
new file mode 100644
index 00000000..245a1170
--- /dev/null
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -0,0 +1,132 @@
+/**
+ * Find all benchmark_results that share the same SKU (hardware + framework +
+ * model + precision + spec_method + benchmark_type + workflow_run)
+ * as the given point. Used by the detail page to render a "switch between
+ * concs / parallelisms" navigator within a single run.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface BenchmarkSibling {
+ id: number;
+ conc: number;
+ /** "on" | "off" | null. */
+ offload_mode: string | null;
+ decode_tp: number;
+ decode_ep: number;
+ prefill_tp: number;
+ prefill_ep: number;
+ num_prefill_gpu: number;
+ num_decode_gpu: number;
+ disagg: boolean;
+ /** True if this row IS the point passed in. */
+ is_current: boolean;
+ /** Whether the row has a stored trace_replay blob (for navigation hint). */
+ has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ benchmark_type: string;
+ /** Human-readable workflow_run summary so the page header can hint at provenance. */
+ github_run_id: number;
+ date: string;
+}
+
+export interface BenchmarkSiblings {
+ sku: BenchmarkSku;
+ siblings: BenchmarkSibling[];
+}
+
+export async function getBenchmarkSiblings(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<BenchmarkSiblings | null> {
+  // Step 1: resolve the SKU defining fields for the requested point.
+  const seed = (await sql`
+    select
+      c.hardware, c.framework, c.model, c.precision, c.spec_method,
+      br.benchmark_type, br.workflow_run_id, br.date::text,
+      wr.github_run_id
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as {
+    hardware: string;
+    framework: string;
+    model: string;
+    precision: string;
+    spec_method: string;
+    benchmark_type: string;
+    workflow_run_id: number;
+    date: string;
+    github_run_id: number;
+  }[];
+  const root = seed[0];
+  if (!root) return null;
+
+  // Step 2: pull every sibling row sharing the SKU within the same workflow_run.
+  const rows = (await sql`
+    select
+      br.id, br.conc, br.offload_mode,
+      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      (br.trace_replay_id is not null) as has_trace
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    where br.workflow_run_id = ${root.workflow_run_id}
+      and br.benchmark_type = ${root.benchmark_type}
+      and c.hardware = ${root.hardware}
+      and c.framework = ${root.framework}
+      and c.model = ${root.model}
+      and c.precision = ${root.precision}
+      and c.spec_method = ${root.spec_method}
+    order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc
+  `) as unknown as {
+    id: number;
+    conc: number;
+    offload_mode: string | null;
+    decode_tp: number;
+    decode_ep: number;
+    prefill_tp: number;
+    prefill_ep: number;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    disagg: boolean;
+    has_trace: boolean;
+  }[];
+
+  const siblings: BenchmarkSibling[] = rows.map((r) => ({
+    id: Number(r.id),
+    conc: r.conc,
+    offload_mode: r.offload_mode,
+    decode_tp: r.decode_tp,
+    decode_ep: r.decode_ep,
+    prefill_tp: r.prefill_tp,
+    prefill_ep: r.prefill_ep,
+    num_prefill_gpu: r.num_prefill_gpu,
+    num_decode_gpu: r.num_decode_gpu,
+    disagg: r.disagg,
+    is_current: Number(r.id) === benchmarkResultId,
+    has_trace: r.has_trace,
+  }));
+
+  return {
+    sku: {
+      hardware: root.hardware,
+      framework: root.framework,
+      model: root.model,
+      precision: root.precision,
+      spec_method: root.spec_method,
+      benchmark_type: root.benchmark_type,
+      github_run_id: Number(root.github_run_id),
+      date: root.date,
+    },
+    siblings,
+  };
+}
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 1c30b1fd..2291dc0c 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -1,6 +1,13 @@
import type { DbClient } from '../connection.js';
export interface BenchmarkRow {
+ /**
+ * Stable per-point id from benchmark_results. Used by the frontend to look
+ * up associated detail blobs (e.g. trace_replay histograms).
+ * Number is fine in TS even though it's a Postgres bigint — precision loss on
+ * huge ids is hypothetical; in practice well below Number.MAX_SAFE_INTEGER.
+ */
+ id: number;
hardware: string;
framework: string;
model: string;
@@ -18,9 +25,13 @@ export interface BenchmarkRow {
decode_num_workers: number;
num_prefill_gpu: number;
num_decode_gpu: number;
- isl: number;
- osl: number;
+ benchmark_type: string;
+ // Null for agentic_traces; numeric for single_turn fixed-seq runs.
+ isl: number | null;
+ osl: number | null;
conc: number;
+ /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+ offload_mode: string;
image: string | null;
metrics: Record;
date: string;
@@ -42,8 +53,56 @@ export async function getLatestBenchmarks(
modelKey: string | string[],
date?: string,
exact?: boolean,
+ /**
+ * If set, filter to a specific GitHub Actions workflow run.
+ * Bypasses the "latest per config" logic — when two runs landed on the same
+ * date and the user picked one in the run selector, this scopes the chart
+ * data to that run only. Value matches the URL param `g_runid` (a
+ * stringified github_run_id, not the DB id).
+ */
+ runId?: string,
): Promise {
const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
+ if (runId) {
+ const rows = await sql`
+ SELECT
+ br.id,
+ c.hardware,
+ c.framework,
+ c.model,
+ c.precision,
+ c.spec_method,
+ c.disagg,
+ c.is_multinode,
+ c.prefill_tp,
+ c.prefill_ep,
+ c.prefill_dp_attention,
+ c.prefill_num_workers,
+ c.decode_tp,
+ c.decode_ep,
+ c.decode_dp_attention,
+ c.decode_num_workers,
+ c.num_prefill_gpu,
+ c.num_decode_gpu,
+ br.benchmark_type,
+ br.offload_mode,
+ br.isl,
+ br.osl,
+ br.conc,
+ br.image,
+ br.metrics,
+ br.date::text,
+ CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url
+ FROM benchmark_results br
+ JOIN configs c ON c.id = br.config_id
+ JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+ WHERE c.model = ANY(${modelKeys})
+ AND br.error IS NULL
+ AND wr.github_run_id = ${runId}::bigint
+ ORDER BY br.config_id, br.conc, br.isl, br.osl
+ `;
+ return rows as unknown as BenchmarkRow[];
+ }
if (date) {
// Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
// exact=true: only return data from this exact date (for GPU comparison)
@@ -51,6 +110,7 @@ export async function getLatestBenchmarks(
const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
const rows = await sql`
SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+ br.id,
c.hardware,
c.framework,
c.model,
@@ -68,6 +128,8 @@ export async function getLatestBenchmarks(
c.decode_num_workers,
c.num_prefill_gpu,
c.num_decode_gpu,
+ br.benchmark_type,
+ br.offload_mode,
br.isl,
br.osl,
br.conc,
@@ -89,6 +151,7 @@ export async function getLatestBenchmarks(
// No date filter: use materialized view for instant lookups
const rows = await sql`
SELECT
+ lb.id,
c.hardware,
c.framework,
c.model,
@@ -106,6 +169,8 @@ export async function getLatestBenchmarks(
c.decode_num_workers,
c.num_prefill_gpu,
c.num_decode_gpu,
+ lb.benchmark_type,
+ lb.offload_mode,
lb.isl,
lb.osl,
lb.conc,
@@ -153,6 +218,7 @@ export async function getAllBenchmarksForHistory(
c.decode_num_workers,
c.num_prefill_gpu,
c.num_decode_gpu,
+ br.benchmark_type,
br.isl,
br.osl,
br.conc,
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..321434be
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -0,0 +1,96 @@
+import { describe, expect, it } from 'vitest';
+
+import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+
+/** Build one aiperf JSONL record for the synthetic fixture. */
+function rec(
+ conversation_id: string,
+ turn_index: number,
+ fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number },
+): string {
+ return JSON.stringify({
+ metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' },
+ metrics: {
+ request_latency: { value: fields.latency_ms, unit: 'ms' },
+ time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+ input_sequence_length: { value: fields.isl, unit: 'tokens' },
+ output_sequence_length: { value: fields.osl, unit: 'tokens' },
+ },
+ });
+}
+
+describe('computeDerivedFromBlob', () => {
+ it('returns nulls when no usable records', () => {
+ const out = computeDerivedFromBlob('');
+ expect(out.normalized_session_time_s).toBeNull();
+ expect(out.p90_prefill_tps_per_user).toBeNull();
+ });
+
+ it('rescales single-session time and computes P90 prefill', () => {
+ // One session, two turns. load = (100+50) + (200+50) = 400.
+ // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s.
+ const jsonl = [
+ rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+ rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+ ].join('\n');
+ const out = computeDerivedFromBlob(jsonl);
+ expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
+ // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200.
+ expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+ });
+
+ it('rescales times across sessions with unequal load', () => {
+ // s1: 1 turn, load = 100, T = 1s
+ // s2: 1 turn, load = 300, T = 3s
+ // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2
+ // Mean T̃ = 2.0
+ const jsonl = [
+ rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }),
+ rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }),
+ ].join('\n');
+ const out = computeDerivedFromBlob(jsonl);
+ expect(out.normalized_session_time_s).toBeCloseTo(2, 6);
+ });
+
+ it('drops records missing required fields and skips non-profiling phase', () => {
+ const lines = [
+ rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+ // missing TTFT — should be skipped
+ JSON.stringify({
+ metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' },
+ metrics: {
+ request_latency: { value: 1000, unit: 'ms' },
+ input_sequence_length: { value: 100, unit: 'tokens' },
+ output_sequence_length: { value: 50, unit: 'tokens' },
+ },
+ }),
+ // warmup phase — should be skipped
+ JSON.stringify({
+ metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' },
+ metrics: {
+ request_latency: { value: 9999, unit: 'ms' },
+ time_to_first_token: { value: 9999, unit: 'ms' },
+ input_sequence_length: { value: 100, unit: 'tokens' },
+ output_sequence_length: { value: 50, unit: 'tokens' },
+ },
+ }),
+ ];
+ const out = computeDerivedFromBlob(lines.join('\n'));
+ expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
+ expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+ });
+
+ it('p90 across turns: 10-turn session picks the right rank', () => {
+ // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910.
+ const turns = Array.from({ length: 10 }, (_, i) =>
+ rec('s1', i, {
+ isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens
+ osl: 10,
+ ttft_ms: 1000, // 1 second → rates: 100..1000 tps
+ latency_ms: 1500,
+ }),
+ );
+ const out = computeDerivedFromBlob(turns.join('\n'));
+ expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+ });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
new file mode 100644
index 00000000..35a4b76c
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -0,0 +1,264 @@
+/**
+ * Live-computed per-point metrics derived from the stored aiperf
+ * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB
+ * because they require grouping by `conversation_id` and aggregating per
+ * session — work that's cheap once per agentic point but adds up to be
+ * meaningful only when actually plotted.
+ *
+ * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal
+ * (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of
+ * per-turn `request_latency` per session (inter-turn tool/thinking gaps are
+ * inherently excluded since we only sum the active GPU time, not wallclock).
+ * Each session's time is rescaled by `mean_load / session_load`, where load
+ * is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
+ *
+ * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ * Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn
+ * in every session — the per-session percentile + cross-session mean
+ * sandwich was discarded because it just dampens tail behavior.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+import { STATS_VERSION } from './agentic-aggregates';
+
+export interface DerivedAgenticMetric {
+ /** benchmark_results.id this entry belongs to. */
+ id: number;
+ /** Mean normalized session time in seconds. */
+ normalized_session_time_s: number | null;
+ /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
+ p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
+ * HTTP driver caps responses at 64 MB — chunk to stay well under.
+ */
+const QUERY_CHUNK_SIZE = 6;
+
+interface RecordMetrics {
+ request_latency?: { value?: number; unit?: string } | number;
+ time_to_first_token?: { value?: number; unit?: string } | number;
+ input_sequence_length?: { value?: number } | number;
+ output_sequence_length?: { value?: number } | number;
+}
+
+interface RecordMetadata {
+ conversation_id?: string;
+ turn_index?: number;
+ benchmark_phase?: string;
+}
+
+interface ProfileRecord {
+ metadata?: RecordMetadata;
+ metrics?: RecordMetrics;
+}
+
+interface TurnFields {
+ request_latency_ms: number;
+ ttft_ms: number;
+ isl: number;
+ osl: number;
+}
+
+/** Pull a finite number out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return Number.isFinite(v) ? v : undefined;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+function extractTurn(rec: ProfileRecord): TurnFields | null {
+ const m = rec.metrics ?? {};
+ const rl = readNum(m.request_latency);
+ const tt = readNum(m.time_to_first_token);
+ const isl = readNum(m.input_sequence_length);
+ const osl = readNum(m.output_sequence_length);
+ if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
+ if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+ return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
+}
+
+/** Linear-interpolated percentile (matches numpy's default linear method). */
+function quantile(sortedAsc: number[], q: number): number {
+ if (sortedAsc.length === 0) return Number.NaN;
+ if (sortedAsc.length === 1) return sortedAsc[0]!;
+ const pos = (sortedAsc.length - 1) * q;
+ const lo = Math.floor(pos);
+ const hi = Math.ceil(pos);
+ if (lo === hi) return sortedAsc[lo]!;
+ return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+ if (xs.length === 0) return Number.NaN;
+ let s = 0;
+ for (const x of xs) s += x;
+ return s / xs.length;
+}
+
+/**
+ * Parse one point's JSONL and return the two derived metrics. Both fields are
+ * `null` if the blob has no usable records.
+ */
+export function computeDerivedFromBlob(jsonl: string): {
+  normalized_session_time_s: number | null;
+  p90_prefill_tps_per_user: number | null;
+} {
+  // Group records by conversation_id, filter to the profiling phase.
+  const bySession = new Map<string, TurnFields[]>();
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const sid = rec.metadata?.conversation_id;
+    if (!sid) continue;
+    const turn = extractTurn(rec);
+    if (!turn) continue;
+    let list = bySession.get(sid);
+    if (!list) {
+      list = [];
+      bySession.set(sid, list);
+    }
+    list.push(turn);
+  }
+  if (bySession.size === 0) {
+    return { normalized_session_time_s: null, p90_prefill_tps_per_user: null };
+  }
+
+  // Per-session aggregates for session time; per-turn prefill rates pool into
+  // a single global array so the percentile sees the full distribution.
+  const sessionTimesS: number[] = [];
+  const sessionLoads: number[] = [];
+  const allPrefillRates: number[] = [];
+  for (const turns of bySession.values()) {
+    let timeMs = 0;
+    let load = 0;
+    for (const t of turns) {
+      timeMs += t.request_latency_ms;
+      load += t.isl + t.osl;
+      const ttftSec = t.ttft_ms / 1000;
+      if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
+    }
+    if (load > 0) {
+      sessionTimesS.push(timeMs / 1000);
+      sessionLoads.push(load);
+    }
+  }
+
+  // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+  let normalized: number | null = null;
+  if (sessionTimesS.length > 0) {
+    const meanLoad = meanOf(sessionLoads);
+    if (meanLoad > 0) {
+      const scaled: number[] = [];
+      for (let i = 0; i < sessionTimesS.length; i++) {
+        const ti = sessionTimesS[i]!;
+        const li = sessionLoads[i]!;
+        if (li > 0) scaled.push(ti * (meanLoad / li));
+      }
+      normalized = scaled.length > 0 ? meanOf(scaled) : null;
+    }
+  }
+
+  let prefill: number | null = null;
+  if (allPrefillRates.length > 0) {
+    allPrefillRates.sort((a, b) => a - b);
+    prefill = quantile(allPrefillRates, 0.9);
+  }
+
+  return {
+    normalized_session_time_s: normalized,
+    p90_prefill_tps_per_user: prefill,
+  };
+}
+
+export async function getDerivedAgenticMetrics(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<DerivedAgenticMetricMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const result: DerivedAgenticMetricMap = {};
+
+  // Fast path: read the pre-computed values out of `aggregate_stats`. The
+  // ingest pipeline computes both metrics in the same pass that produces the
+  // percentile bundles, so a single SQL round-trip covers most ids without
+  // touching the gzipped profile blob.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: {
+      version?: number;
+      normalizedSessionTimeS?: number | null;
+      p90PrefillTpsPerUser?: number | null;
+    } | null;
+  }[];
+
+  const idsNeedingBlob: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+      };
+    } else {
+      idsNeedingBlob.push(id);
+    }
+  }
+
+  if (idsNeedingBlob.length === 0) return result;
+
+  // Fallback: parse the profile blob directly. Used for rows whose
+  // `aggregate_stats` is null or computed by an older STATS_VERSION; the
+  // backfill script drains the population so this path should be rare.
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl);
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        normalized_session_time_s,
+        p90_prefill_tps_per_user,
+      };
+    } catch {
+      // Skip malformed blobs silently — frontend treats missing ids as "no data".
+    }
+  }
+  return result;
+}
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
new file mode 100644
index 00000000..2bd3e251
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.ts
@@ -0,0 +1,48 @@
+/**
+ * Per-request timeline for the agentic detail page's Gantt view.
+ *
+ * Backed by `agentic_trace_replay.request_timeline` (pre-computed at
+ * ingest time, see `etl/compute-request-timeline.ts`). The fast path is
+ * a single SQL row read; the slow path re-computes from
+ * `profile_export_jsonl_gz` and is only taken when the column is missing
+ * or the stored `REQUEST_TIMELINE_VERSION` is stale.
+ */
+
+import {
+ REQUEST_TIMELINE_VERSION,
+ computeRequestTimeline,
+ type RequestTimeline,
+} from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
+
+interface RawRow {
+ blob: Buffer | null;
+ request_timeline: RequestTimeline | null;
+}
+
+export async function getRequestTimeline(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<RequestTimeline | null> {
+  const rows = (await sql`
+    select
+      atr.profile_export_jsonl_gz as blob,
+      atr.request_timeline
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawRow[];
+  const row = rows[0];
+  if (!row) return null;
+
+  // Fast path: pre-computed timeline at the current version.
+  if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) {
+    return row.request_timeline;
+  }
+
+  // Slow path: recompute from the blob (rare — only stale/missing rows).
+  return computeRequestTimeline(row.blob);
+}
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
new file mode 100644
index 00000000..20ebc0d5
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -0,0 +1,95 @@
+/**
+ * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl`
+ * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller
+ * passes the set of `benchmark_results.id`s it wants and receives one entry
+ * per id that actually has a trace_replay blob (others are silently skipped).
+ *
+ * The JSONL has one JSON object per request with the shape:
+ * { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } }
+ *
+ * Returns raw arrays rather than pre-binned histograms — payload stays tiny
+ * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin
+ * however it wants.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface TraceHistogramPoint {
+ /** benchmark_results.id this entry belongs to. */
+ id: number;
+ /** Input sequence length (tokens) per completed request. */
+ isl: number[];
+ /** Output sequence length (tokens) per completed request. */
+ osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+/**
+ * Cap the number of blobs we pull in a single Neon HTTP query — the serverless
+ * driver returns 507 ("response is too large, max 64 MB") if the combined gzip
+ * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB
+ * compressed, so we stay well below the cap at 12.
+ */
+const QUERY_CHUNK_SIZE = 12;
+
+export async function getTraceHistograms(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceHistogramMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  const result: TraceHistogramMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const line of jsonl.split('\n')) {
+        if (!line) continue;
+        let rec: { metrics?: Record<string, { value?: number } | number> };
+        try {
+          rec = JSON.parse(line);
+        } catch {
+          continue;
+        }
+        const m = rec.metrics ?? {};
+        const islVal = readMetric(m['input_sequence_length']);
+        const oslVal = readMetric(m['output_sequence_length']);
+        if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal);
+        if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal);
+      }
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        isl,
+        osl,
+      };
+    } catch {
+      // Drop malformed blobs silently — caller treats missing ids as "no data".
+    }
+  }
+  return result;
+}
+
+function readMetric(v: { value?: number } | number | undefined): number | undefined {
+ if (v === undefined || v === null) return undefined;
+ if (typeof v === 'number') return v;
+ return v.value;
+}
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
new file mode 100644
index 00000000..624b6ed3
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -0,0 +1,156 @@
+/**
+ * Time-series view of one agentic benchmark point: chart-ready arrays for
+ * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS,
+ * and per-source prompt-token counts.
+ *
+ * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest
+ * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL
+ * row read; the slow path re-computes from `server_metrics_json_gz` and is
+ * only taken when the column is missing or the stored
+ * `CHART_SERIES_VERSION` is stale (the backfill script should drain that).
+ */
+
+import {
+ CHART_SERIES_VERSION,
+ computeChartSeries,
+ type ChartSeries,
+ type QueueDepthPoint,
+ type TimeSeriesPoint,
+} from '../etl/compute-chart-series';
+
+import type { DbClient } from '../connection.js';
+
+export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series';
+
+/**
+ * Descriptive context for a single benchmark result, as selected by
+ * `getTraceServerMetrics` from `benchmark_results`, `configs`, and
+ * `workflow_runs`.
+ */
+export interface PointMeta {
+ /** `benchmark_results.id` of the point. */
+ id: number;
+ // Configuration columns read from the joined `configs` row.
+ hardware: string;
+ framework: string;
+ model: string;
+ precision: string;
+ spec_method: string;
+ disagg: boolean;
+ // Per-result columns read from `benchmark_results`.
+ conc: number;
+ offload_mode: string | null;
+ isl: number | null;
+ osl: number | null;
+ benchmark_type: string;
+ /** `benchmark_results.date`, cast to text in SQL. */
+ date: string;
+ /**
+ * GitHub Actions run URL for jumping to the source. Built in SQL as
+ * `html_url || '/attempts/' || run_attempt`; null when `html_url` is null.
+ */
+ run_url: string | null;
+ /** Cumulative end-of-run cache-hit number the dashboard already shows. */
+ server_gpu_cache_hit_rate: number | null;
+ /** Cumulative end-of-run CPU offload cache-hit. */
+ server_cpu_cache_hit_rate: number | null;
+}
+
+/**
+ * Response shape for one agentic benchmark point: the point's metadata plus
+ * the chart-ready series copied from a `ChartSeries` by `merge`.
+ */
+export interface TraceServerMetrics {
+  /** Point context — hardware, model, conc, etc. for the page header. */
+  meta: PointMeta;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */
+  kvCacheUsage: TimeSeriesPoint[];
+  /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */
+  prefixCacheHitRate: TimeSeriesPoint[];
+  /** Request queue depth: running, waiting, total per scrape. */
+  queueDepth: QueueDepthPoint[];
+  /**
+   * Per-source prompt-token counts over time (counter rate per scrape).
+   * Keyed by the value of the `source` label (typically `local_cache_hit`,
+   * `external_cache_hit`, `miss`, etc.). Plot as stacked area.
+   *
+   * Typed from `ChartSeries` because `merge` copies the field through
+   * unchanged; a bare `Record` without type arguments does not compile.
+   */
+  promptTokensBySource: ChartSeries['promptTokensBySource'];
+  /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */
+  prefillTps: TimeSeriesPoint[];
+  /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
+  decodeTps: TimeSeriesPoint[];
+}
+
+/**
+ * Row shape the query in `getTraceServerMetrics` is cast to: the `PointMeta`
+ * columns plus the two payload columns from `agentic_trace_replay`. That
+ * table is left-joined, so both payload columns may be null.
+ */
+interface RawMetaRow extends PointMeta {
+ /** `agentic_trace_replay.server_metrics_json_gz`, selected as `blob`. */
+ blob: Buffer | null;
+ /** `agentic_trace_replay.chart_series` as stored. */
+ chart_series: ChartSeries | null;
+}
+
+/**
+ * Project a raw query row down to its `PointMeta` fields.
+ *
+ * The payload columns (`blob`, `chart_series`) are left out. `id` and the two
+ * cache-hit rates are passed through `Number()`; a null rate stays null.
+ *
+ * @param row Row produced by the query in `getTraceServerMetrics`.
+ * @returns A fresh `PointMeta` object with a fixed key order.
+ */
+function buildMeta(row: RawMetaRow): PointMeta {
+  // Null means "metric absent" and must not be coerced to 0.
+  const numberOrNull = (raw: number | null): number | null =>
+    raw === null ? null : Number(raw);
+
+  const {
+    hardware,
+    framework,
+    model,
+    precision,
+    spec_method,
+    disagg,
+    conc,
+    offload_mode,
+    isl,
+    osl,
+    benchmark_type,
+    date,
+    run_url,
+  } = row;
+
+  return {
+    id: Number(row.id),
+    hardware,
+    framework,
+    model,
+    precision,
+    spec_method,
+    disagg,
+    conc,
+    offload_mode,
+    isl,
+    osl,
+    benchmark_type,
+    date,
+    run_url,
+    server_gpu_cache_hit_rate: numberOrNull(row.server_gpu_cache_hit_rate),
+    server_cpu_cache_hit_rate: numberOrNull(row.server_cpu_cache_hit_rate),
+  };
+}
+
+/**
+ * Combine point metadata with a chart series into a `TraceServerMetrics`.
+ *
+ * Only the series fields listed here are copied; anything else on `series`
+ * (for example the `version` the caller checks) is not forwarded.
+ *
+ * @param meta Point context placed under `meta`.
+ * @param series Chart series, pre-computed or freshly computed.
+ */
+function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
+  const {
+    startNs,
+    endNs,
+    durationS,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  } = series;
+
+  return {
+    meta,
+    startNs,
+    endNs,
+    durationS,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}
+
+/**
+ * Load the time-series view for one benchmark result.
+ *
+ * Reads a single row joining `benchmark_results`, `configs`,
+ * `workflow_runs`, and (left-joined) `agentic_trace_replay`. It returns the
+ * stored `chart_series` when its version matches `CHART_SERIES_VERSION`;
+ * otherwise it recomputes the series from the server-metrics blob.
+ *
+ * @param sql Database client used as a tagged template.
+ * @param benchmarkResultId `benchmark_results.id` of the point to load.
+ * @returns The merged metadata and series, or `null` when the query returns
+ *   no row, the row has no server-metrics blob, or `computeChartSeries`
+ *   returns nothing.
+ */
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      atr.server_metrics_json_gz as blob,
+      atr.chart_series,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+  // NOTE(review): the blob is required (and fetched) even when chart_series
+  // alone would satisfy the fast path below — confirm that is intended.
+  if (!row.blob) return null;
+  const meta = buildMeta(row);
+
+  // Fast path: pre-computed chart_series at the current version.
+  if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
+    return merge(meta, row.chart_series);
+  }
+
+  // Slow path: compute from the blob. `computeChartSeries` handles
+  // ERR_STRING_TOO_LONG via a stream-parse fallback so high-conc TP+EP
+  // rows succeed even before the backfill drains them.
+  const series = await computeChartSeries(row.blob);
+  if (!series) return null;
+  return merge(meta, series);
+}
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index b4e4f255..d5e2d933 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise {
const rows = await sql`
- SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text
+ SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text
FROM availability a
WHERE EXISTS (
SELECT 1
@@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise= 0.4'}
+ stream-chain@3.6.3:
+ resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==}
+
stream-combiner@0.2.2:
resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==}
+ stream-json@2.1.0:
+ resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==}
+
string-width@4.2.3:
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
engines: {node: '>=8'}
@@ -7392,6 +7413,15 @@ snapshots:
'@types/stats.js@0.17.4': {}
+ '@types/stream-chain@2.1.0':
+ dependencies:
+ '@types/node': 25.7.0
+
+ '@types/stream-json@1.7.8':
+ dependencies:
+ '@types/node': 25.7.0
+ '@types/stream-chain': 2.1.0
+
'@types/three@0.184.1':
dependencies:
'@dimforge/rapier3d-compat': 0.12.0
@@ -10752,11 +10782,17 @@ snapshots:
es-errors: 1.3.0
internal-slot: 1.1.0
+ stream-chain@3.6.3: {}
+
stream-combiner@0.2.2:
dependencies:
duplexer: 0.1.2
through: 2.3.8
+ stream-json@2.1.0:
+ dependencies:
+ stream-chain: 3.6.3
+
string-width@4.2.3:
dependencies:
emoji-regex: 8.0.0