diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index f0e1692a..06f30f69 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -50,6 +50,7 @@ import ComparisonChangelog from './ComparisonChangelog'; import CustomCosts from './CustomCosts'; import CustomPowers from './CustomPowers'; import GPUGraph from './GPUGraph'; +import InteractivityTables from './InteractivityTables'; import ReplayLauncher, { type ReplayLauncherHandle } from '../replay/ReplayLauncher'; import TrendChart from './TrendChart'; @@ -642,6 +643,13 @@ export default function ChartDisplay() { )}
{displayGraphs}
+ {/* Summary tables below the Pareto chart. Render for every y-axis + metric; the tables auto-pick higher/lower-is-better semantics from + the active metric's roofline direction on the interactivity chart + definition. Reactive to model, precision, sequence and the legend + on/off toggles via useInference() context. */} + + {/* Performance Over Time — Modal Drill-Down */} { + it('renders 1.0× as near-neutral and produces dark text', () => { + const { background, color } = ratioColor(1); + expect(background).toMatch(/^rgb\(/u); + expect(color).toBe('#0a0a0a'); + }); + + it('produces visibly distinct colors for common positive ratios', () => { + // The whole point of bumping the cap from 3× to 30× and switching to HSL: + // common ratios from 2× up through 20× must land at clearly different + // greens rather than all saturating to the same deep color. + const ratios = [2, 5, 7, 10, 20]; + const backgrounds = ratios.map((r) => ratioColor(r).background); + expect(new Set(backgrounds).size).toBe(ratios.length); + }); + + it('produces a monotonically darker green for higher ratios (higher-better)', () => { + // Each step up in ratio should reduce HSL lightness (=> lower luminance) + // until the saturation cap. Use a coarse luminance proxy via the green + // channel of the rgb() string. 
+ const greens = [1.5, 2, 5, 10, 20, 33].map((r) => { + const m = /rgb\((\d+),\s*(\d+),\s*(\d+)\)/u.exec(ratioColor(r).background); + if (!m) throw new Error('rgb parse failed'); + return Number(m[1]) + Number(m[2]) + Number(m[3]); // r+g+b as a luminance proxy + }); + for (let i = 1; i < greens.length; i++) { + expect(greens[i]).toBeLessThan(greens[i - 1]); + } + }); + + it('clamps beyond RATIO_CAP_HI / RATIO_CAP_LO', () => { + expect(ratioColor(RATIO_CAP_HI).background).toBe(ratioColor(RATIO_CAP_HI * 10).background); + expect(ratioColor(RATIO_CAP_LO).background).toBe(ratioColor(RATIO_CAP_LO / 10).background); + }); + + it('is log-symmetric: reciprocal ratios swap red/green at equal magnitude', () => { + // ratioColor(2) and ratioColor(0.5) should be mirror images (same lightness, + // opposite hues). Compare the dominant channel: 2× should be green-dominant + // (g > r), 0.5× should be red-dominant (r > g). + const up = /rgb\((\d+),\s*(\d+),\s*(\d+)\)/u.exec(ratioColor(2).background); + const down = /rgb\((\d+),\s*(\d+),\s*(\d+)\)/u.exec(ratioColor(0.5).background); + if (!up || !down) throw new Error('rgb parse failed'); + expect(Number(up[2])).toBeGreaterThan(Number(up[1])); + expect(Number(down[1])).toBeGreaterThan(Number(down[2])); + }); + + it("inverts hue for direction='lower'", () => { + // For lower-is-better, a ratio > 1 means "other is worse" → red. + const higher = ratioColor(5, 'higher'); + const lower = ratioColor(5, 'lower'); + const hi = /rgb\((\d+),\s*(\d+),\s*(\d+)\)/u.exec(higher.background); + const lo = /rgb\((\d+),\s*(\d+),\s*(\d+)\)/u.exec(lower.background); + if (!hi || !lo) throw new Error('rgb parse failed'); + // higher-better at 5× → green-dominant; lower-better at 5× → red-dominant. 
+ expect(Number(hi[2])).toBeGreaterThan(Number(hi[1])); + expect(Number(lo[1])).toBeGreaterThan(Number(lo[2])); + }); + + it('switches text color to white once background luminance drops', () => { + // Deep ratios should produce white text (background too dark for black). + expect(ratioColor(30).color).toBe('#ffffff'); + expect(ratioColor(1 / 30).color).toBe('#ffffff'); + // Near 1×, text should stay dark. + expect(ratioColor(1.5).color).toBe('#0a0a0a'); + }); +}); diff --git a/packages/app/src/components/inference/ui/InteractivityTables.tsx b/packages/app/src/components/inference/ui/InteractivityTables.tsx new file mode 100644 index 00000000..511265fd --- /dev/null +++ b/packages/app/src/components/inference/ui/InteractivityTables.tsx @@ -0,0 +1,783 @@ +'use client'; + +import { useMemo, useState } from 'react'; +import { HelpCircle } from 'lucide-react'; + +import { useInference } from '@/components/inference/InferenceContext'; +import type { InferenceData } from '@/components/inference/types'; +import { Card } from '@/components/ui/card'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { + TooltipContent, + TooltipProvider, + TooltipRoot, + TooltipTrigger, +} from '@/components/ui/tooltip'; +import { track } from '@/lib/analytics'; +import { getHardwareConfig } from '@/lib/constants'; +import { getMetricParetoDirection } from '@/lib/metric-direction'; +import { + aucUnderFrontier, + aucWindow, + interpAlongFrontier, + paretoFrontier, + type ParetoDirection, + type Point2D, +} from '@/lib/pareto'; +import { cn, getDisplayLabel } from '@/lib/utils'; + +/** + * Default baseline preferences. We resolve to whichever enabled config best + * matches each name; if none match we fall back to the first enabled config. 
+ */
+const DEFAULT_THROUGHPUT_BASELINE_HINTS = ['mi355x_sglang'];
+const DEFAULT_AUC_PRIMARY_HINTS = ['b200_sglang'];
+const DEFAULT_AUC_SECONDARY_HINTS = ['mi355x_sglang'];
+const DEFAULT_AUC_TERTIARY_HINTS = ['mi355x_atom'];
+
+/** One hardware config shown in the tables, with its frontier points. */
+interface ConfigSeries {
+  hwKey: string;
+  label: string;
+  frontier: Point2D[];
+}
+
+/**
+ * Pick a default baseline from the enabled hwKeys.
+ *
+ * Hints are tried in order. For each hint, the first enabled key that contains
+ * the hint as a case-insensitive substring wins. With `excludeMtp` (the
+ * default) keys ending in `_mtp` are skipped, so the hint 'mi355x_sglang'
+ * selects 'mi355x_sglang' but NOT 'mi355x_sglang_mtp'.
+ *
+ * NOTE(review): only a trailing `_mtp` is excluded; a key with `_mtp_` in the
+ * middle would still match. Confirm hwKeys always carry MTP as a suffix.
+ *
+ * @param enabledKeys hwKeys to choose from.
+ * @param hints Substrings to look for, highest priority first.
+ * @param excludeMtp When true, ignore keys that end in `_mtp`.
+ * @returns The first matching key, or null when no enabled key matches.
+ */
+function pickDefaultBaseline(
+  enabledKeys: string[],
+  hints: string[],
+  excludeMtp = true,
+): string | null {
+  for (const hint of hints) {
+    const lcHint = hint.toLowerCase();
+    const match = enabledKeys.find((k) => {
+      const lc = k.toLowerCase();
+      if (!lc.includes(lcHint)) return false;
+      if (excludeMtp && lc.endsWith('_mtp')) return false;
+      return true;
+    });
+    if (match) return match;
+  }
+  return null;
+}
+
+/**
+ * Format a number for a table cell, scaling precision to magnitude.
+ *
+ * Non-finite input renders as an em dash and zero as '0'. Otherwise the value
+ * is shown as a locale-grouped integer, then with 0 / 1 / 2 / 3 decimals as the
+ * magnitude shrinks, and in exponent notation below 0.01.
+ *
+ * Each threshold sits half a display unit below the nominal tier boundary so
+ * that a value which rounds UP into the next tier is formatted by that tier:
+ * 999.6 goes through the grouped-integer branch instead of printing '1000',
+ * and 99.96 prints '100' instead of '100.0'.
+ */
+function formatValue(n: number): string {
+  if (!Number.isFinite(n)) return '—';
+  const abs = Math.abs(n);
+  if (abs === 0) return '0';
+  if (abs >= 999.5) return Math.round(n).toLocaleString();
+  if (abs >= 99.95) return n.toFixed(0);
+  if (abs >= 9.995) return n.toFixed(1);
+  if (abs >= 0.9995) return n.toFixed(2);
+  if (abs >= 0.01) return n.toFixed(3);
+  return n.toExponential(2);
+}
+
+/**
+ * Convert one 8-bit sRGB channel (0–255) to its linear-light value using the
+ * piecewise formula (and the 0.03928 knee) published in WCAG 2.x.
+ */
+function srgbToLinear(c: number): number {
+  const v = c / 255;
+  return v <= 0.03928 ? v / 12.92 : ((v + 0.055) / 1.055) ** 2.4;
+}
+
+/** WCAG 2.x relative luminance for an sRGB color. */
+function relativeLuminance(r: number, g: number, b: number): number {
+  return 0.2126 * srgbToLinear(r) + 0.7152 * srgbToLinear(g) + 0.0722 * srgbToLinear(b);
+}
+
+// Saturation endpoints for the ratio→color ramp. The dataset can show ratios
+// up to ~30× between best and worst configs (e.g.
GB300 vs MI355X SGL), so
+// caps must be wide enough that common ratios (2×, 5×, 10×, 20×) sit at
+// visibly different points on the ramp rather than all clamping to the same
+// extreme. Stays log-symmetric: t=+1 at RATIO_CAP_HI, t=-1 at RATIO_CAP_LO.
+export const RATIO_CAP_HI = 30;
+export const RATIO_CAP_LO = 1 / 30;
+
+/** HSL → RGB. h in [0, 360), s/l in [0, 1]. Returns integer [0,255] channels. */
+function hslToRgb(h: number, s: number, l: number): { r: number; g: number; b: number } {
+  // c = chroma, x = second-largest component, m = offset added to every channel.
+  const c = (1 - Math.abs(2 * l - 1)) * s;
+  const hp = h / 60;
+  const x = c * (1 - Math.abs((hp % 2) - 1));
+  let r1 = 0;
+  let g1 = 0;
+  let b1 = 0;
+  if (hp < 1) {
+    r1 = c;
+    g1 = x;
+  } else if (hp < 2) {
+    r1 = x;
+    g1 = c;
+  } else if (hp < 3) {
+    g1 = c;
+    b1 = x;
+  } else if (hp < 4) {
+    g1 = x;
+    b1 = c;
+  } else if (hp < 5) {
+    r1 = x;
+    b1 = c;
+  } else {
+    r1 = c;
+    b1 = x;
+  }
+  const m = l - c / 2;
+  return {
+    r: Math.round((r1 + m) * 255),
+    g: Math.round((g1 + m) * 255),
+    b: Math.round((b1 + m) * 255),
+  };
+}
+
+// HSL endpoints. Lightness ramps 0.97 (near-white at t=0) down to 0.28 (deep
+// color at |t|=1); saturation eases up so the deep end stays vivid. RGB
+// interpolation collapses perceptually between green-300 and green-700, so we
+// drive the ramp in HSL instead — this is what gives 5× / 10× / 20× / 30×
+// visibly different greens.
+const HUE_GREEN = 142; // tailwind green-ish
+const HUE_RED = 0;
+const L_NEUTRAL = 0.97;
+const L_DEEP = 0.28;
+const S_NEUTRAL = 0.6;
+const S_DEEP = 0.78;
+
+/**
+ * Map a ratio (other / baseline) to a red→white→green color, centered at 1.0×
+ * and log-symmetric.
+ *
+ * For 'higher' (default): ratio = 1 → near-white; ratio ≥ RATIO_CAP_HI → deep
+ * green; ratio ≤ RATIO_CAP_LO → deep red.
+ *
+ * For 'lower': INVERT — ratio ≤ RATIO_CAP_LO → deep green; ratio ≥
+ * RATIO_CAP_HI → deep red.
+ *
+ * Out-of-range input is clamped to the caps (so zero, negative and ±Infinity
+ * ratios saturate). NaN — e.g. from 0 / 0 — is rendered as the neutral 1.0×
+ * color rather than producing an invalid `rgb(NaN, NaN, NaN)` string.
+ *
+ * Returns { background, color } with the WCAG-derived text color.
+ */
+export function ratioColor(
+  ratio: number,
+  direction: ParetoDirection = 'higher',
+): { background: string; color: string } {
+  // NaN survives Math.min / Math.max, so it has to be handled before clamping.
+  const safeRatio = Number.isNaN(ratio) ? 1 : ratio;
+  const clamped = Math.max(RATIO_CAP_LO, Math.min(RATIO_CAP_HI, safeRatio));
+  // log-symmetric t in [-1, 1]: t=0 at 1.0, t=+1 at cap-hi, t=-1 at cap-lo.
+  let t = Math.log(clamped) / Math.log(RATIO_CAP_HI);
+  // For lower-is-better, flip the sign so ratio > 1 → red and ratio < 1 → green.
+  if (direction === 'lower') t = -t;
+  const magnitude = Math.abs(t);
+  const hue = t >= 0 ? HUE_GREEN : HUE_RED;
+  const lightness = L_NEUTRAL + (L_DEEP - L_NEUTRAL) * magnitude;
+  const saturation = S_NEUTRAL + (S_DEEP - S_NEUTRAL) * magnitude;
+  const { r, g, b } = hslToRgb(hue, saturation, lightness);
+  // Dark text on light backgrounds, white text once the background gets dark.
+  const lum = relativeLuminance(r, g, b);
+  const color = lum > 0.45 ? '#0a0a0a' : '#ffffff';
+  return { background: `rgb(${r}, ${g}, ${b})`, color };
+}
+
+const INFINITY_GREEN_BG = '#14532d'; // dark green (green-900)
+const INFINITY_RED_BG = '#7f1d1d'; // dark red (red-900)
+const SELF_BG = '#fbbf24'; // amber-400 for baseline-vs-self
+const COL_BEST_BG = '#bbf7d0'; // green-200 for best per column in main table
+
+/**
+ * Build per-config Pareto frontiers from filtered InferenceData. Filters by
+ * selected precisions + active legend toggles, then groups by hwKey and runs
+ * the shared 2-D Pareto algorithm on (x, y) = (interactivity, selected metric).
+ * Direction is taken from the active y-metric's roofline direction.
+ */
+function useConfigSeries(direction: ParetoDirection): {
+  configs: ConfigSeries[];
+  yLabel: string;
+  yTitle: string;
+} {
+  const { graphs, activeHwTypes, selectedPrecisions, hardwareConfig, selectedYAxisMetric } =
+    useInference();
+  return useMemo(() => {
+    const interactivityGraph = graphs.find((g) => g.chartDefinition.chartType === 'interactivity');
+    if (!interactivityGraph) return { configs: [], yLabel: '', yTitle: '' };
+
+    // Label/title for the active metric live on the chart definition under
+    // `<metric>_label` / `<metric>_title`; fall back to '' when absent.
+    const chartDef = interactivityGraph.chartDefinition;
+    const yLabel =
+      (chartDef[`${selectedYAxisMetric}_label` as keyof typeof chartDef] as string) || '';
+    const yTitle =
+      (chartDef[`${selectedYAxisMetric}_title` as keyof typeof chartDef] as string) || '';
+
+    // Group filtered points by hwKey. The map is typed explicitly so the
+    // grouped points do not degrade to `any`.
+    const byHw = new Map<string, InferenceData[]>();
+    for (const d of interactivityGraph.data) {
+      const hw = String(d.hwKey);
+      // An empty activeHwTypes set disables the hardware filter, so every
+      // hwKey passes. NOTE(review): confirm that "nothing toggled on" is meant
+      // to show everything rather than nothing.
+      if (activeHwTypes.size > 0 && !activeHwTypes.has(hw)) continue;
+      if (!selectedPrecisions.includes(d.precision)) continue;
+      if (!Number.isFinite(d.x) || !Number.isFinite(d.y)) continue;
+      const arr = byHw.get(hw) ?? [];
+      arr.push(d);
+      byHw.set(hw, arr);
+    }
+
+    // Configs need at least two raw points AND a two-point frontier to be
+    // listed; anything smaller is dropped.
+    const result: ConfigSeries[] = [];
+    for (const [hwKey, points] of byHw) {
+      if (points.length < 2) continue;
+      const frontier = paretoFrontier(
+        points.map((p) => ({ x: p.x, y: p.y })),
+        direction,
+      );
+      if (frontier.length < 2) continue;
+      const hwConfig = hardwareConfig[hwKey] ?? getHardwareConfig(hwKey);
+      result.push({ hwKey, label: getDisplayLabel(hwConfig), frontier });
+    }
+    // Order: same as legend (hardwareConfig insertion order, already sorted by
+    // model sort index in InferenceContext). Keys missing from hardwareConfig
+    // sort last. Positions are indexed once instead of calling indexOf on
+    // every comparison.
+    const orderIndex = new Map(Object.keys(hardwareConfig).map((key, i) => [key, i] as const));
+    const rank = (hwKey: string): number => orderIndex.get(hwKey) ?? Number.POSITIVE_INFINITY;
+    result.sort((a, b) => {
+      const ra = rank(a.hwKey);
+      const rb = rank(b.hwKey);
+      // Infinity - Infinity is NaN, so equal ranks are answered explicitly.
+      return ra === rb ? 0 : ra - rb;
+    });
+    return { configs: result, yLabel, yTitle };
+  }, [graphs, activeHwTypes, selectedPrecisions, hardwareConfig, selectedYAxisMetric, direction]);
+}
+
+interface BaselineSelectProps {
+  value: string;
+  onChange: (next: string) => void;
+  configs: ConfigSeries[];
+  label: string;
+  testId?: string;
+}
+
+function BaselineSelect({ value, onChange, configs, label, testId }: BaselineSelectProps) {
+  return (
+
+ {label}: + +
+ ); +} + +function InfoIcon({ text }: { text: string }) { + return ( + + + + + + {text} + + + ); +} + +/** Per-interactivity value table + linked ratio heatmap. */ +function ValueAndDiffTable({ + configs, + direction, + yLabel, + yTitle, +}: { + configs: ConfigSeries[]; + direction: ParetoDirection; + yLabel: string; + yTitle: string; +}) { + const higherBetter = direction === 'higher'; + // Compute buckets: every 10 from 10 up through floor(globalMax / 10) * 10. + const buckets = useMemo(() => { + let globalMax = 0; + for (const c of configs) { + const maxX = c.frontier.at(-1)?.x ?? 0; + if (maxX > globalMax) globalMax = maxX; + } + const hi = Math.floor(globalMax / 10) * 10; + const out: number[] = []; + for (let v = 10; v <= hi; v += 10) out.push(v); + return out; + }, [configs]); + + // Per-(config, bucket) value cell, with the column-best highlight. + const valueCells = useMemo(() => { + const grid: (number | null)[][] = configs.map((c) => + buckets.map((b) => interpAlongFrontier(c.frontier, b, direction)), + ); + const colBestRow: (number | null)[] = buckets.map((_, ci) => { + let best: number | null = null; + for (const row of grid) { + const v = row[ci]; + if (v === null) continue; + if (best === null) { + best = v; + continue; + } + if (higherBetter ? v > best : v < best) best = v; + } + return best; + }); + return { grid, colBestRow }; + }, [configs, buckets, direction, higherBetter]); + + // Baseline selection for the ratio sub-table. + const enabledKeys = configs.map((c) => c.hwKey); + const defaultBaseline = + pickDefaultBaseline(enabledKeys, DEFAULT_THROUGHPUT_BASELINE_HINTS) ?? enabledKeys[0] ?? ''; + const [baselineKey, setBaselineKey] = useState(defaultBaseline); + // If the previously-picked baseline isn't enabled anymore, snap to the default. + const effectiveBaseline = enabledKeys.includes(baselineKey) ? 
baselineKey : defaultBaseline;
+  // Row of interpolated values for the effective baseline, or null when the
+  // baseline is not among the rendered configs.
+  const baselineRow = useMemo(() => {
+    const idx = configs.findIndex((c) => c.hwKey === effectiveBaseline);
+    if (idx === -1) return null;
+    return valueCells.grid[idx];
+  }, [configs, valueCells, effectiveBaseline]);
+
+  const directionHint = higherBetter ? 'Higher is better.' : 'Lower is better.';
+  const valueTooltip =
+    `For each enabled config we compute the Pareto frontier of ${yTitle || 'the selected metric'} vs interactivity, ` +
+    `then read off the value at every 10 tok/s/user step. Em-dash means that interactivity is outside the config's reachable range. ` +
+    `Best value per column is highlighted in green. ${directionHint}`;
+
+  // Derive the quoted saturation points from the same constants ratioColor()
+  // clamps to, so the tooltip cannot drift from the actual color ramp.
+  const capHiText = `${RATIO_CAP_HI.toFixed(2)}×`;
+  const capLoText = `${RATIO_CAP_LO.toFixed(2)}×`;
+  const ratioTooltip = higherBetter
+    ? `other / baseline at each bucket, rendered as Nx. "∞" means the baseline cannot reach that interactivity but the other config can (green = good for other); "0×" the reverse (red); "—" means neither can. Color scale is centered at 1.00× and log-symmetric, saturating at ${capHiText} (green) and ${capLoText} (red).`
+    : `other / baseline at each bucket, rendered as Nx. Since lower is better, color is INVERTED: ratios < 1 are green (other uses less than baseline = good) and ratios > 1 are red. "∞" means the baseline cannot reach that interactivity but the other config can — colored red (other is way worse / infinite cost relative to baseline); "0×" the reverse — colored green (other achieves zero relative to baseline = great); "—" means neither can. Saturation caps at ${capHiText} and ${capLoText}.`;
+
+  return (
+
+
+

Per-GPU value at each interactivity bucket

+ +
+
+

+ Linearly interpolated {yLabel || 'metric value'} along each config's Pareto frontier. + Reactive to model, precision, sequence and the legend on/off toggles above. {directionHint} +

+ + {configs.length === 0 ? ( +

+ Enable at least one configuration in the legend to populate the tables. +

+ ) : ( +
+ + + + + {buckets.map((b) => ( + + ))} + + + + + + + {configs.map((c, ri) => ( + + + {buckets.map((b, ci) => { + const v = valueCells.grid[ri][ci]; + if (v === null) { + return ( + + ); + } + const isBest = valueCells.colBestRow[ci] === v; + return ( + + ); + })} + + ))} + +
+ Config + + {b} +
+ Interactivity (tok/s/user) → + +
+ {c.label} + + — + + {formatValue(v)} +
+
+ )} + + {configs.length > 0 && ( +
+
+
+

Ratio vs baseline

+ +
+ { + setBaselineKey(v); + track('inference_throughput_baseline_changed', { baseline: v }); + }} + testId="throughput-baseline-select" + /> +
+
+ + + + + {buckets.map((b) => ( + + ))} + + + + {configs.map((c, ri) => ( + + + {buckets.map((b, ci) => { + const other = valueCells.grid[ri][ci]; + const baseline = baselineRow ? baselineRow[ci] : null; + const isSelf = c.hwKey === effectiveBaseline; + + if (isSelf) { + return ( + + ); + } + + if (other === null && baseline === null) { + return ( + + ); + } + // Baseline can't reach, other can: + // - higher-better: "infinite multiplier of throughput" → great for other → green + // - lower-better: "infinite multiplier of cost" → bad for other → red + if (other !== null && baseline === null) { + const bg = higherBetter ? INFINITY_GREEN_BG : INFINITY_RED_BG; + return ( + + ); + } + // Other can't reach, baseline can: + // - higher-better: other is 0× → bad for other → red + // - lower-better: other is 0× cost → great for other → green + if (other === null && baseline !== null) { + const bg = higherBetter ? INFINITY_RED_BG : INFINITY_GREEN_BG; + return ( + + ); + } + const ratio = other! / baseline!; + const { background, color } = ratioColor(ratio, direction); + return ( + + ); + })} + + ))} + +
+ Config + + {b} +
+ {c.label} + + 1.00× + + — + + ∞ + + 0× + + {ratio.toFixed(2)}× +
+
+
+ )} +
+ ); +} + +/** AUC summary table with three baseline columns. */ +function AucSummaryTable({ + configs, + direction, + yLabel, +}: { + configs: ConfigSeries[]; + direction: ParetoDirection; + yLabel: string; +}) { + const higherBetter = direction === 'higher'; + const hi = useMemo(() => { + let globalMax = 0; + for (const c of configs) { + const maxX = c.frontier.at(-1)?.x ?? 0; + if (maxX > globalMax) globalMax = maxX; + } + return Math.floor(globalMax / 10) * 10; + }, [configs]); + + const aucs = useMemo( + () => configs.map((c) => aucUnderFrontier(c.frontier, 10, hi, direction)), + [configs, hi, direction], + ); + + // Per-config integration window — for lower-is-better this may shrink to + // the reachable x-range; for higher-is-better it's always [10, hi]. + const aucWindows = useMemo( + () => configs.map((c) => aucWindow(c.frontier, 10, hi, direction)), + [configs, hi, direction], + ); + + const enabledKeys = configs.map((c) => c.hwKey); + const defaultPrimary = + pickDefaultBaseline(enabledKeys, DEFAULT_AUC_PRIMARY_HINTS) ?? enabledKeys[0] ?? ''; + const defaultSecondary = + pickDefaultBaseline(enabledKeys, DEFAULT_AUC_SECONDARY_HINTS) ?? enabledKeys[0] ?? ''; + const defaultTertiary = + pickDefaultBaseline(enabledKeys, DEFAULT_AUC_TERTIARY_HINTS, false) ?? enabledKeys[0] ?? ''; + + const [primary, setPrimary] = useState(defaultPrimary); + const [secondary, setSecondary] = useState(defaultSecondary); + const [tertiary, setTertiary] = useState(defaultTertiary); + + const eff = (s: string, d: string) => (enabledKeys.includes(s) ? s : d); + const ePrimary = eff(primary, defaultPrimary); + const eSecondary = eff(secondary, defaultSecondary); + const eTertiary = eff(tertiary, defaultTertiary); + + const baselineAuc = (key: string): number | null => { + const i = configs.findIndex((c) => c.hwKey === key); + return i === -1 ? 
null : aucs[i]; + }; + + const primaryAuc = baselineAuc(ePrimary); + const secondaryAuc = baselineAuc(eSecondary); + const tertiaryAuc = baselineAuc(eTertiary); + + const ratioCell = (auc: number, baseline: number | null, baselineKey: string, hwKey: string) => { + if (baseline === null || baseline === 0) return { text: '—', style: undefined }; + const ratio = auc / baseline; + if (hwKey === baselineKey) { + return { + text: '1.00×', + style: { backgroundColor: SELF_BG, color: '#0a0a0a' }, + }; + } + const { background, color } = ratioColor(ratio, direction); + return { + text: `${ratio.toFixed(2)}×`, + style: { backgroundColor: background, color }, + }; + }; + + const directionHint = higherBetter + ? 'Higher is better — a config that reaches both high interactivity AND high throughput-like value scores best.' + : 'Lower is better — a config that achieves low cost / energy across the reachable interactivity range scores best.'; + + const outOfRangeHint = higherBetter + ? "Outside a config's reachable interactivity range the integrand is treated as 0 (worst case for higher-is-better)." + : "Integration is restricted to each config's reachable interactivity range. The per-row window is shown below the AUC."; + + const aucTooltip = + `Trapezoidal area under each config's ${yLabel || 'selected metric'} vs interactivity Pareto frontier, integrated from 10 to ${hi} tok/s/user. ` + + `${outOfRangeHint} ${directionHint}`; + + return ( + +
+

Area under Pareto frontier (AUC summary)

+ +
+

+ Integration window: 10 → {hi} tok/s/user. {directionHint} +

+ + {configs.length === 0 ? ( +

+ Enable at least one configuration in the legend to populate the AUC summary. +

+ ) : ( + <> +
+ { + setPrimary(v); + track('inference_auc_primary_baseline_changed', { baseline: v }); + }} + testId="auc-primary-baseline-select" + /> + { + setSecondary(v); + track('inference_auc_secondary_baseline_changed', { baseline: v }); + }} + testId="auc-secondary-baseline-select" + /> + { + setTertiary(v); + track('inference_auc_tertiary_baseline_changed', { baseline: v }); + }} + testId="auc-tertiary-baseline-select" + /> +
+
+ + + + + + {!higherBetter && ( + + )} + + + + + + + {configs.map((c, i) => { + const auc = aucs[i]; + const win = aucWindows[i]; + const primaryR = ratioCell(auc, primaryAuc, ePrimary, c.hwKey); + const secondaryR = ratioCell(auc, secondaryAuc, eSecondary, c.hwKey); + const tertiaryR = ratioCell(auc, tertiaryAuc, eTertiary, c.hwKey); + return ( + + + + {!higherBetter && ( + + )} + + + + + ); + })} + +
ConfigAUCWindow + Ratio vs primary + + Ratio vs secondary + + Ratio vs tertiary +
+ {c.label} + {formatValue(auc)} + {win ? `${win.lo}→${win.hi}` : '—'} + + {primaryR.text} + + {secondaryR.text} + + {tertiaryR.text} +
+
+ + )} +
+ ); +} + +/** + * Section that renders the two summary tables below the Pareto chart on the + * inference page. Renders for all y-axis metrics; the "is higher better" + * direction is taken from the active metric's roofline direction on the + * interactivity chart definition. + */ +export default function InteractivityTables() { + const { selectedYAxisMetric, graphs } = useInference(); + + const interactivityGraph = graphs.find((g) => g.chartDefinition.chartType === 'interactivity'); + const direction: ParetoDirection = interactivityGraph + ? getMetricParetoDirection(interactivityGraph.chartDefinition, selectedYAxisMetric) + : 'higher'; + + const { configs, yLabel, yTitle } = useConfigSeries(direction); + + if (!interactivityGraph) return null; + + return ( + <> + + + + ); +} diff --git a/packages/app/src/lib/__fixtures__/eight_config_data.json b/packages/app/src/lib/__fixtures__/eight_config_data.json new file mode 100644 index 00000000..e18fbcb2 --- /dev/null +++ b/packages/app/src/lib/__fixtures__/eight_config_data.json @@ -0,0 +1,420 @@ +{ + "MI355X_SGLang_nonMTP": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 42.4425, + "Token_Throughput_per_GPU_tok_s_gpu": 43.927, + "Median_TTFT_ms": 1.8887 + }, + { + "Conc": 2, + "Interactivity_tok_s_user": 40.0599, + "Token_Throughput_per_GPU_tok_s_gpu": 83.1647, + "Median_TTFT_ms": 1.837 + }, + { + "Conc": 4, + "Interactivity_tok_s_user": 32.8412, + "Token_Throughput_per_GPU_tok_s_gpu": 135.9584, + "Median_TTFT_ms": 1.5697 + }, + { + "Conc": 8, + "Interactivity_tok_s_user": 28.2131, + "Token_Throughput_per_GPU_tok_s_gpu": 233.3679, + "Median_TTFT_ms": 1.4875 + }, + { + "Conc": 16, + "Interactivity_tok_s_user": 20.0572, + "Token_Throughput_per_GPU_tok_s_gpu": 336.0692, + "Median_TTFT_ms": 1.4909 + }, + { + "Conc": 16, + "Interactivity_tok_s_user": 20.1404, + "Token_Throughput_per_GPU_tok_s_gpu": 302.1082, + "Median_TTFT_ms": 4.7495 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 16.5069, + 
"Token_Throughput_per_GPU_tok_s_gpu": 488.2661, + "Median_TTFT_ms": 5.2389 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 15.0528, + "Token_Throughput_per_GPU_tok_s_gpu": 802.9119, + "Median_TTFT_ms": 14.1662 + }, + { + "Conc": 128, + "Interactivity_tok_s_user": 10.121, + "Token_Throughput_per_GPU_tok_s_gpu": 1194.3396, + "Median_TTFT_ms": 16.3291 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 6.0659, + "Token_Throughput_per_GPU_tok_s_gpu": 1503.2389, + "Median_TTFT_ms": 19.203 + } + ], + "MI355X_ATOM_nonMTP": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 43.3401, + "Token_Throughput_per_GPU_tok_s_gpu": 47.4194, + "Median_TTFT_ms": 0.4657 + }, + { + "Conc": 2, + "Interactivity_tok_s_user": 41.1286, + "Token_Throughput_per_GPU_tok_s_gpu": 89.3156, + "Median_TTFT_ms": 0.4643 + }, + { + "Conc": 4, + "Interactivity_tok_s_user": 39.377, + "Token_Throughput_per_GPU_tok_s_gpu": 168.4226, + "Median_TTFT_ms": 0.4865 + }, + { + "Conc": 8, + "Interactivity_tok_s_user": 35.9213, + "Token_Throughput_per_GPU_tok_s_gpu": 307.4319, + "Median_TTFT_ms": 0.4701 + }, + { + "Conc": 16, + "Interactivity_tok_s_user": 29.9705, + "Token_Throughput_per_GPU_tok_s_gpu": 512.6047, + "Median_TTFT_ms": 0.4759 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 23.9073, + "Token_Throughput_per_GPU_tok_s_gpu": 814.9395, + "Median_TTFT_ms": 0.4957 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 16.6093, + "Token_Throughput_per_GPU_tok_s_gpu": 1162.8702, + "Median_TTFT_ms": 0.6299 + }, + { + "Conc": 128, + "Interactivity_tok_s_user": 10.4412, + "Token_Throughput_per_GPU_tok_s_gpu": 1469.8935, + "Median_TTFT_ms": 0.6871 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 2.3998, + "Token_Throughput_per_GPU_tok_s_gpu": 704.7307, + "Median_TTFT_ms": 3.5858 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 3.7953, + "Token_Throughput_per_GPU_tok_s_gpu": 2138.47, + "Median_TTFT_ms": 1.7068 + } + ], + "B200_SGLang_nonMTP": [ + { + "Conc": 2, + "Interactivity_tok_s_user": 
80.2493, + "Token_Throughput_per_GPU_tok_s_gpu": 145.0523, + "Median_TTFT_ms": 0.454 + }, + { + "Conc": 4, + "Interactivity_tok_s_user": 70.4374, + "Token_Throughput_per_GPU_tok_s_gpu": 261.4948, + "Median_TTFT_ms": 0.4077 + }, + { + "Conc": 8, + "Interactivity_tok_s_user": 60.7308, + "Token_Throughput_per_GPU_tok_s_gpu": 513.6405, + "Median_TTFT_ms": 0.3958 + }, + { + "Conc": 16, + "Interactivity_tok_s_user": 47.7448, + "Token_Throughput_per_GPU_tok_s_gpu": 816.2807, + "Median_TTFT_ms": 0.3986 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 34.3571, + "Token_Throughput_per_GPU_tok_s_gpu": 1169.9087, + "Median_TTFT_ms": 0.4118 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 19.183, + "Token_Throughput_per_GPU_tok_s_gpu": 1330.0161, + "Median_TTFT_ms": 0.8243 + }, + { + "Conc": 128, + "Interactivity_tok_s_user": 13.233, + "Token_Throughput_per_GPU_tok_s_gpu": 1945.3725, + "Median_TTFT_ms": 0.8562 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 8.9919, + "Token_Throughput_per_GPU_tok_s_gpu": 2600.2499, + "Median_TTFT_ms": 0.9585 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 6.0656, + "Token_Throughput_per_GPU_tok_s_gpu": 3492.0547, + "Median_TTFT_ms": 1.1088 + } + ], + "B200_DynamoVLLM_nonMTP_disagg": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 80.1069, + "Token_Throughput_per_GPU_tok_s_gpu": 29.4083, + "Median_TTFT_ms": 3.8403 + }, + { + "Conc": 16, + "Interactivity_tok_s_user": 53.1696, + "Token_Throughput_per_GPU_tok_s_gpu": 391.6534, + "Median_TTFT_ms": 3.328 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 40.7967, + "Token_Throughput_per_GPU_tok_s_gpu": 614.4892, + "Median_TTFT_ms": 3.3836 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 32.1821, + "Token_Throughput_per_GPU_tok_s_gpu": 1008.1675, + "Median_TTFT_ms": 3.2835 + }, + { + "Conc": 128, + "Interactivity_tok_s_user": 26.334, + "Token_Throughput_per_GPU_tok_s_gpu": 1660.2779, + "Median_TTFT_ms": 3.4014 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 19.5779, + 
"Token_Throughput_per_GPU_tok_s_gpu": 1753.8925, + "Median_TTFT_ms": 1.4311 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 18.2665, + "Token_Throughput_per_GPU_tok_s_gpu": 3195.7277, + "Median_TTFT_ms": 1.5909 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 17.3737, + "Token_Throughput_per_GPU_tok_s_gpu": 5801.349, + "Median_TTFT_ms": 2.9751 + }, + { + "Conc": 8192, + "Interactivity_tok_s_user": 14.8238, + "Token_Throughput_per_GPU_tok_s_gpu": 7329.1025, + "Median_TTFT_ms": 222.4298 + }, + { + "Conc": 12345, + "Interactivity_tok_s_user": 14.8342, + "Token_Throughput_per_GPU_tok_s_gpu": 7360.2266, + "Median_TTFT_ms": 369.2497 + } + ], + "GB200_DynamoVLLM_nonMTP_disagg": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 75.4019, + "Token_Throughput_per_GPU_tok_s_gpu": 32.7974, + "Median_TTFT_ms": 0.6629 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 24.2552, + "Token_Throughput_per_GPU_tok_s_gpu": 3147.9943, + "Median_TTFT_ms": 2.0077 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 32.4352, + "Token_Throughput_per_GPU_tok_s_gpu": 1613.8082, + "Median_TTFT_ms": 2.3161 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 21.0842, + "Token_Throughput_per_GPU_tok_s_gpu": 5336.1547, + "Median_TTFT_ms": 2.341 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 28.2006, + "Token_Throughput_per_GPU_tok_s_gpu": 2004.5428, + "Median_TTFT_ms": 17.6427 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 21.5425, + "Token_Throughput_per_GPU_tok_s_gpu": 6036.2244, + "Median_TTFT_ms": 40.5199 + }, + { + "Conc": 4096, + "Interactivity_tok_s_user": 15.092, + "Token_Throughput_per_GPU_tok_s_gpu": 8933.0452, + "Median_TTFT_ms": 51.7808 + }, + { + "Conc": 4096, + "Interactivity_tok_s_user": 18.402, + "Token_Throughput_per_GPU_tok_s_gpu": 8153.0641, + "Median_TTFT_ms": 117.6863 + } + ], + "GB200_DynamoVLLM_MTP_disagg": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 152.9557, + "Token_Throughput_per_GPU_tok_s_gpu": 143.2128, + "Median_TTFT_ms": 0.3757 + 
}, + { + "Conc": 16, + "Interactivity_tok_s_user": 99.465, + "Token_Throughput_per_GPU_tok_s_gpu": 269.0948, + "Median_TTFT_ms": 1.5557 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 83.1891, + "Token_Throughput_per_GPU_tok_s_gpu": 490.2363, + "Median_TTFT_ms": 1.3076 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 63.4528, + "Token_Throughput_per_GPU_tok_s_gpu": 721.1578, + "Median_TTFT_ms": 1.5374 + }, + { + "Conc": 128, + "Interactivity_tok_s_user": 44.0639, + "Token_Throughput_per_GPU_tok_s_gpu": 2584.5112, + "Median_TTFT_ms": 2.502 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 16.4509, + "Token_Throughput_per_GPU_tok_s_gpu": 5781.1445, + "Median_TTFT_ms": 2.4078 + } + ], + "GB300_DynamoSGLang_nonMTP_disagg": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 92.0641, + "Token_Throughput_per_GPU_tok_s_gpu": 94.053, + "Median_TTFT_ms": 0.9646 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 47.2857, + "Token_Throughput_per_GPU_tok_s_gpu": 3106.1871, + "Median_TTFT_ms": 113.8477 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 37.9203, + "Token_Throughput_per_GPU_tok_s_gpu": 7099.6766, + "Median_TTFT_ms": 8.9875 + }, + { + "Conc": 4096, + "Interactivity_tok_s_user": 26.3267, + "Token_Throughput_per_GPU_tok_s_gpu": 9599.2883, + "Median_TTFT_ms": 31.6256 + }, + { + "Conc": 8192, + "Interactivity_tok_s_user": 22.3924, + "Token_Throughput_per_GPU_tok_s_gpu": 10419.6758, + "Median_TTFT_ms": 64.9696 + }, + { + "Conc": 21504, + "Interactivity_tok_s_user": 11.0323, + "Token_Throughput_per_GPU_tok_s_gpu": 11444.0756, + "Median_TTFT_ms": 92.2394 + } + ], + "GB300_DynamoSGLang_MTP_disagg": [ + { + "Conc": 1, + "Interactivity_tok_s_user": 173.3876, + "Token_Throughput_per_GPU_tok_s_gpu": 161.3425, + "Median_TTFT_ms": 0.9401 + }, + { + "Conc": 8, + "Interactivity_tok_s_user": 160.0061, + "Token_Throughput_per_GPU_tok_s_gpu": 289.7123, + "Median_TTFT_ms": 1.6635 + }, + { + "Conc": 32, + "Interactivity_tok_s_user": 135.693, + 
"Token_Throughput_per_GPU_tok_s_gpu": 688.7133, + "Median_TTFT_ms": 4.0586 + }, + { + "Conc": 64, + "Interactivity_tok_s_user": 116.0557, + "Token_Throughput_per_GPU_tok_s_gpu": 1226.2824, + "Median_TTFT_ms": 4.8214 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 85.8193, + "Token_Throughput_per_GPU_tok_s_gpu": 2652.9551, + "Median_TTFT_ms": 24.9809 + }, + { + "Conc": 256, + "Interactivity_tok_s_user": 70.3439, + "Token_Throughput_per_GPU_tok_s_gpu": 3884.9172, + "Median_TTFT_ms": 23.7946 + }, + { + "Conc": 512, + "Interactivity_tok_s_user": 58.2314, + "Token_Throughput_per_GPU_tok_s_gpu": 6229.1466, + "Median_TTFT_ms": 19.6604 + }, + { + "Conc": 1024, + "Interactivity_tok_s_user": 49.6076, + "Token_Throughput_per_GPU_tok_s_gpu": 7564.4013, + "Median_TTFT_ms": 22.5606 + } + ] +} diff --git a/packages/app/src/lib/metric-direction.ts b/packages/app/src/lib/metric-direction.ts new file mode 100644 index 00000000..6a4d0511 --- /dev/null +++ b/packages/app/src/lib/metric-direction.ts @@ -0,0 +1,60 @@ +/** + * Single source of truth for whether a chart Y-axis metric is "higher is + * better" or "lower is better". + * + * The chart config (inference-chart-config.json) already declares this per + * metric via the roofline direction field (`y__roofline`): + * - 'upper_right' / 'upper_left' → higher-is-better + * - 'lower_right' / 'lower_left' → lower-is-better + * + * This module exposes a helper for non-chart consumers (tables, AUC, etc) + * that need the same direction info without re-reading the JSON. 
+ */ + +import type { ChartDefinition } from '@/components/inference/types'; + +import type { ParetoDirection } from './pareto'; + +export type RooflineDirection = 'upper_right' | 'upper_left' | 'lower_left' | 'lower_right'; + +export function rooflineDirectionToPareto(dir: RooflineDirection | undefined): ParetoDirection { + if (dir === 'lower_left' || dir === 'lower_right') return 'lower'; + return 'higher'; +} + +export function isHigherBetter(dir: RooflineDirection | undefined): boolean { + return rooflineDirectionToPareto(dir) === 'higher'; +} + +/** + * Look up the roofline direction for a given Y-axis metric on a given chart + * definition. Returns the configured direction or undefined when the chart + * has no mapping for that metric. + */ +export function getMetricRooflineDirection( + chartDef: ChartDefinition, + yAxisMetric: string, +): RooflineDirection | undefined { + const key = `${yAxisMetric}_roofline` as keyof ChartDefinition; + const val = chartDef[key]; + if ( + val === 'upper_right' || + val === 'upper_left' || + val === 'lower_left' || + val === 'lower_right' + ) { + return val; + } + return undefined; +} + +/** + * Convenience: pareto direction for a metric on a chart definition. + * Defaults to 'higher' when unknown. 
+ */ +export function getMetricParetoDirection( + chartDef: ChartDefinition, + yAxisMetric: string, +): ParetoDirection { + return rooflineDirectionToPareto(getMetricRooflineDirection(chartDef, yAxisMetric)); +} diff --git a/packages/app/src/lib/pareto.test.ts b/packages/app/src/lib/pareto.test.ts new file mode 100644 index 00000000..d6f67069 --- /dev/null +++ b/packages/app/src/lib/pareto.test.ts @@ -0,0 +1,351 @@ +import { describe, expect, it } from 'vitest'; + +import { + aucUnderFrontier, + aucWindow, + interpAlongFrontier, + paretoFrontier, + type Point2D, +} from '@/lib/pareto'; + +import eightConfigData from './__fixtures__/eight_config_data.json'; + +interface RawPoint { + Conc: number; + Interactivity_tok_s_user: number; + Token_Throughput_per_GPU_tok_s_gpu: number; + Median_TTFT_ms: number; +} + +const toPoints = (raw: RawPoint[]): Point2D[] => + raw.map((p) => ({ x: p.Interactivity_tok_s_user, y: p.Token_Throughput_per_GPU_tok_s_gpu })); + +// Independent fine-grid trapezoidal reference. Matches the Python np.interp +// + np.trapezoid approach used in the original spec. Used by the sanity +// check below — kept out of `src/lib/pareto.ts` because the production +// implementation is the closed-form piecewise integral, which agrees with +// this to fp drift on piecewise-linear input. 
+function referenceAuc(frontier: Point2D[], lo: number, hi: number): number { + if (frontier.length === 0 || hi <= lo) return 0; + const minX = frontier[0].x; + const last = frontier.at(-1); + if (!last) return 0; + const maxX = last.x; + const N = 100_001; + const step = (hi - lo) / (N - 1); + const ys: number[] = []; + for (let i = 0; i < N; i++) { + const x = lo + i * step; + if (x < minX || x > maxX) { + ys.push(0); + continue; + } + let j = 0; + while (j < frontier.length - 1 && frontier[j + 1].x < x) j++; + const a = frontier[j]; + const b = frontier[Math.min(j + 1, frontier.length - 1)]; + if (b.x === a.x) { + ys.push(Math.max(a.y, b.y)); + } else { + const t = (x - a.x) / (b.x - a.x); + ys.push(a.y + t * (b.y - a.y)); + } + } + let area = 0; + for (let i = 0; i < ys.length - 1; i++) { + area += ((ys[i] + ys[i + 1]) / 2) * step; + } + return area; +} + +describe('paretoFrontier', () => { + it('returns empty for empty input', () => { + expect(paretoFrontier([])).toEqual([]); + }); + + it('keeps only non-dominated points and sorts ascending x (higher-is-better)', () => { + const pts: Point2D[] = [ + { x: 10, y: 100 }, + { x: 20, y: 90 }, // dominated by (10,100)? no — x is higher + { x: 5, y: 110 }, + { x: 15, y: 50 }, // dominated by (20,90) + { x: 30, y: 60 }, + ]; + const f = paretoFrontier(pts); + // non-dominated: (5,110), (10,100)?, (20,90), (30,60) + // (10,100) dominated by (5,110)? (5,110) has lower x but higher y → not dominated + // For "higher x AND higher y both better", (10,100) is dominated iff some point has + // x > 10 AND y > 100. (20,90)? no. (30,60)? no. So (10,100) is on the frontier. + expect(f.map((p) => p.x)).toEqual([5, 10, 20, 30]); + expect(f.map((p) => p.y)).toEqual([110, 100, 90, 60]); + }); + + // For lower-is-better, a point dominates iff x > other.x AND y < other.y. + // Frontier consists of points with no dominator. + it('keeps only non-dominated points (lower-is-better)', () => { + // Cost-like metric where less is better. 
Higher x is still better. + const pts: Point2D[] = [ + { x: 5, y: 1 }, + { x: 10, y: 0.5 }, // dominates (5, 1.0)? x=10>5 AND y=0.5<1.0 → YES, dominates + { x: 15, y: 0.8 }, // not dominated by (10, 0.5) since y=0.8 > 0.5; dominated by (20, 0.3)? yes + { x: 20, y: 0.3 }, + { x: 25, y: 0.6 }, // dominated by (20, 0.3)? x=20<25 → no; dominator would need x>25 AND y<0.6 + { x: 30, y: 0.4 }, // dominates (25, 0.6)? x=30>25 AND y=0.4<0.6 → yes + ]; + const f = paretoFrontier(pts, 'lower'); + // Walking: keep points where no other has x>p.x AND y10, y=0.3<0.5 → yes → drop + // (15,0.8): dominated by (20,0.3)? yes → drop + // (20,0.3): dominated? need x>20 AND y<0.3 — (30,0.4) no, (25,0.6) no → keep + // (25,0.6): dominated by (30,0.4)? yes → drop + // (30,0.4): dominated? need x>30 — none → keep + expect(f.map((p) => p.x)).toEqual([20, 30]); + expect(f.map((p) => p.y)).toEqual([0.3, 0.4]); + }); +}); + +describe('interpAlongFrontier', () => { + const f: Point2D[] = [ + { x: 10, y: 100 }, + { x: 20, y: 200 }, + { x: 50, y: 350 }, + ]; + + it('returns null outside range', () => { + expect(interpAlongFrontier(f, 5)).toBeNull(); + expect(interpAlongFrontier(f, 100)).toBeNull(); + }); + + it('returns exact value at vertices', () => { + expect(interpAlongFrontier(f, 10)).toBe(100); + expect(interpAlongFrontier(f, 20)).toBe(200); + expect(interpAlongFrontier(f, 50)).toBe(350); + }); + + it('linearly interpolates between vertices', () => { + // midpoint of (10,100)-(20,200) → 15, 150 + expect(interpAlongFrontier(f, 15)).toBeCloseTo(150, 9); + // 1/3 of the way (20→50, 0→1/3) at x=30 → y = 200 + (30-20)/(50-20) * (350-200) = 200 + 50 = 250 + expect(interpAlongFrontier(f, 30)).toBeCloseTo(250, 9); + }); + + it('linearly interpolates the same way for lower-is-better frontiers', () => { + // Direction only affects which y wins at duplicate-x ties; here all x's + // are unique so the result is identical. 
+ const lf: Point2D[] = [ + { x: 10, y: 1 }, + { x: 20, y: 0.5 }, + { x: 50, y: 0.2 }, + ]; + expect(interpAlongFrontier(lf, 15, 'lower')).toBeCloseTo(0.75, 9); + expect(interpAlongFrontier(lf, 50, 'lower')).toBe(0.2); + }); +}); + +describe('aucUnderFrontier', () => { + it('integrates a trivial triangle exactly', () => { + // frontier y=x from x=0..10, AUC over [0,10] = 50 + const f = [ + { x: 0, y: 0 }, + { x: 10, y: 10 }, + ]; + expect(aucUnderFrontier(f, 0, 10)).toBeCloseTo(50, 9); + }); + + it('zeros the integrand outside the frontier x-range (higher-better)', () => { + // frontier only covers x in [10, 20], integrate [0, 30] + const f = [ + { x: 10, y: 5 }, + { x: 20, y: 5 }, + ]; + // y=5 over x in [10,20] → AUC = 50. Outside that range y treated as 0. + expect(aucUnderFrontier(f, 0, 30)).toBeCloseTo(50, 9); + }); + + it('returns 0 when integration window is outside the frontier', () => { + const f = [ + { x: 10, y: 5 }, + { x: 20, y: 5 }, + ]; + expect(aucUnderFrontier(f, 30, 40)).toBe(0); + }); + + // For lower-is-better: integrate only over the reachable x-range. The + // result is identical to higher-better when the requested [lo, hi] is a + // strict subset of [minX, maxX] (no zero-pad region in either case), and + // differs only when the requested window extends beyond the frontier. + it('lower-better integrates only over reachable range', () => { + // frontier covers x in [10, 20] with constant y=2 + const f = [ + { x: 10, y: 2 }, + { x: 20, y: 2 }, + ]; + // Integrate the whole range — should give 20 (y=2 × span=10). + expect(aucUnderFrontier(f, 10, 20, 'lower')).toBeCloseTo(20, 9); + // Higher-better with window beyond range: zero-pads → still 20. + expect(aucUnderFrontier(f, 0, 30, 'higher')).toBeCloseTo(20, 9); + // Lower-better with the same window: clips to reachable [10, 20] → 20 too. 
+ expect(aucUnderFrontier(f, 0, 30, 'lower')).toBeCloseTo(20, 9); + }); + + it('lower-better AUC matches reachable-only window, not zero-padded', () => { + // Non-flat lower-better frontier: cost falls then rises. + const f = [ + { x: 10, y: 1 }, + { x: 20, y: 0.5 }, + { x: 30, y: 0.4 }, + ]; + // Requested [0, 50]: lower-better should clip to [10, 30]. + // Trapezoid (10→20): (1.0+0.5)/2 * 10 = 7.5 + // Trapezoid (20→30): (0.5+0.4)/2 * 10 = 4.5 + // Total: 12 + expect(aucUnderFrontier(f, 0, 50, 'lower')).toBeCloseTo(12, 9); + + // Higher-better with same window would zero-pad [0,10] and [30,50], + // adding 0 contribution there, so total is also 12 — but the SEMANTICS + // differ. Verify by changing a range where higher-better differs: + // Pretend the frontier extends y outwards by adding 0-pad ranges: + // For higher-better, [0,50] integrates the same 12 (zero outside). + expect(aucUnderFrontier(f, 0, 50, 'higher')).toBeCloseTo(12, 9); + }); +}); + +describe('aucWindow', () => { + const f: Point2D[] = [ + { x: 10, y: 5 }, + { x: 30, y: 8 }, + ]; + + it('returns the requested window for higher-better', () => { + expect(aucWindow(f, 0, 50, 'higher')).toEqual({ lo: 0, hi: 50 }); + }); + + it('clips to reachable range for lower-better', () => { + expect(aucWindow(f, 0, 50, 'lower')).toEqual({ lo: 10, hi: 30 }); + expect(aucWindow(f, 15, 25, 'lower')).toEqual({ lo: 15, hi: 25 }); + }); + + it('returns null when reachable window is empty', () => { + expect(aucWindow(f, 40, 50, 'lower')).toBeNull(); + }); +}); + +// Sanity-check the full pipeline (pareto → AUC) on the spec's 8-config +// sample dataset (FP4 DeepSeek V4 Pro, 8K/1K, TP=8) using the production +// integration window: [10, floor(globalMax / 10) * 10]. +// +// We re-derive the expected AUC for each config from first principles — +// independent trapezoidal integration over the same Pareto frontier — and +// assert that aucUnderFrontier matches. 
Hard-coding numeric expectations +// would bake in whichever upper bound the test was written against; this +// way the test continues to be a meaningful sanity check if the window +// rule changes again. +describe('matches independent trapezoidal AUCs on spec sample data', () => { + const allXs = (Object.values(eightConfigData) as RawPoint[][]).flatMap((rows) => + rows.map((r) => r.Interactivity_tok_s_user), + ); + const globalMax = Math.max(...allXs); + const upperBound = Math.floor(globalMax / 10) * 10; + const window: [number, number] = [10, upperBound]; + + const names = Object.keys(eightConfigData as Record); + for (const name of names) { + it(`${name} matches independent reference (higher-better)`, () => { + const raw = (eightConfigData as Record)[name]; + expect(raw, `fixture missing ${name}`).toBeTruthy(); + const f = paretoFrontier(toPoints(raw)); + const auc = aucUnderFrontier(f, window[0], window[1]); + const expected = referenceAuc(f, window[0], window[1]); + // Both methods are trapezoidal on the same piecewise-linear function; + // they should agree to within tiny floating-point drift. + expect(Math.abs(auc - expected) / Math.max(expected, 1)).toBeLessThan(0.001); + }); + } +}); + +// Synthetic lower-is-better fixture — cost-per-token style metric across +// three configs. Verifies the direction-aware path end-to-end: +// pareto → interp → AUC and the window clipping. +describe('lower-is-better integration (synthetic cost fixture)', () => { + // Treat y as $/M tokens (lower = better). x is interactivity. + const configs: Record = { + // "Cheap-fast": low cost, broad interactivity range — should dominate. + cheap: [ + { x: 10, y: 0.5 }, + { x: 25, y: 0.4 }, + { x: 50, y: 0.6 }, + { x: 80, y: 1.2 }, + ], + // "Expensive-slow": consistently higher cost, narrower range. + expensive: [ + { x: 15, y: 1.5 }, + { x: 30, y: 1.2 }, + { x: 45, y: 1 }, + { x: 60, y: 1.3 }, + ], + // "Niche": only reaches very high interactivity. 
Cost dips then rises so + // the lower-better frontier keeps multiple points. + niche: [ + { x: 60, y: 0.9 }, + { x: 80, y: 0.5 }, + { x: 100, y: 0.7 }, + ], + }; + + it('pareto frontiers prune dominated points correctly', () => { + const cheap = paretoFrontier(configs.cheap, 'lower'); + // For 'cheap': dominator needs x>p.x AND y10, y<0.5. (25,0.4) qualifies → drop (10,0.5)? Yes. + // (25,0.4): need x>25 AND y<0.4. (50,0.6) no, (80,1.2) no → keep + // (50,0.6): need x>50 AND y<0.6. (80,1.2) no → keep + // (80,1.2): need x>80 — none → keep + expect(cheap.map((p) => p.x)).toEqual([25, 50, 80]); + + const expensive = paretoFrontier(configs.expensive, 'lower'); + // (15,1.5): (30,1.2) dominates → drop + // (30,1.2): (45,1.0) dominates → drop + // (45,1.0): need x>45, y<1.0 — (60,1.3) no → keep + // (60,1.3): keep + expect(expensive.map((p) => p.x)).toEqual([45, 60]); + }); + + it('AUC is restricted to reachable window for each config', () => { + const cheap = paretoFrontier(configs.cheap, 'lower'); + const niche = paretoFrontier(configs.niche, 'lower'); + + // For cheap, reachable x: [25, 80]. Common window [10, 100] clips. + const cheapWin = aucWindow(cheap, 10, 100, 'lower'); + expect(cheapWin).toEqual({ lo: 25, hi: 80 }); + + // For niche, the lower-better frontier prunes the (60, 0.9) point + // (dominated by (80, 0.5)). Reachable x range becomes [80, 100]. + const nicheWin = aucWindow(niche, 10, 100, 'lower'); + expect(nicheWin).toEqual({ lo: 80, hi: 100 }); + + // AUCs: + // cheap: (25,0.4)→(50,0.6)→(80,1.2). Trapezoids: + // 25→50: (0.4+0.6)/2*25 = 12.5 + // 50→80: (0.6+1.2)/2*30 = 27 + // total = 39.5 + expect(aucUnderFrontier(cheap, 10, 100, 'lower')).toBeCloseTo(39.5, 6); + + // niche frontier: (80,0.5)→(100,0.7). 
Trapezoid (80→100): + // (0.5+0.7)/2 * 20 = 12 + expect(aucUnderFrontier(niche, 10, 100, 'lower')).toBeCloseTo(12, 6); + }); + + it('interpolation respects lower-better best at duplicate x', () => { + // Construct a frontier with duplicate x to verify min vs max selection. + const f: Point2D[] = [ + { x: 10, y: 1 }, + { x: 20, y: 0.5 }, + { x: 20, y: 0.7 }, // wouldn't naturally appear post-frontier, but the + // helper should still return the better (min) y for lower-better. + ]; + // For lower direction at duplicate x, prefer min y. + expect(interpAlongFrontier(f, 20, 'lower')).toBe(0.5); + // For higher direction, prefer max y. + expect(interpAlongFrontier(f, 20, 'higher')).toBe(0.7); + }); +}); diff --git a/packages/app/src/lib/pareto.ts b/packages/app/src/lib/pareto.ts new file mode 100644 index 00000000..0e1e8d1e --- /dev/null +++ b/packages/app/src/lib/pareto.ts @@ -0,0 +1,200 @@ +/** + * Shared 2-D Pareto-frontier utilities for both "higher y is better" and + * "lower y is better" curves over an x-axis where higher x is always better + * (e.g. interactivity tok/s/user — more is more responsive). + * + * The chart layer has its own metric-aware helpers (calculateRoofline et al) + * that operate on full InferenceData points and `upper_left | upper_right | …` + * directions. This module is the plain numeric core — it works on + * `{ x, y }`-shaped points and is what tables / non-chart consumers should use. + * + * Direction parameter: + * - 'higher' (default): a point dominates iff x and y are BOTH greater. The + * visible frontier on an interactivity vs throughput chart looks like + * "upper-left" because as concurrency rises x falls while y rises. + * - 'lower': a point dominates iff x is greater AND y is LOWER. Used for + * cost / J / power metrics where less is more. + * + * Sorting note: the frontier is always returned in ascending-x order so + * downstream interp/AUC can treat the xs as a sorted grid. 
/**
 * Numeric core of the 2-D Pareto helpers. Every function below works on plain
 * `{ x, y }` points where a larger x is always better and the caller chooses
 * whether a larger ('higher') or a smaller ('lower') y is better.
 */

/** A point on a 2-D trade-off curve. */
export interface Point2D {
  x: number;
  y: number;
}

/** Whether a larger ('higher') or a smaller ('lower') y is the better value. */
export type ParetoDirection = 'higher' | 'lower';

/**
 * Compute the Pareto frontier of `points`.
 *
 * A point is dropped when some other point has an x that is at least as large
 * and a y that is at least as good. Consequently the result holds at most one
 * point per distinct x (the one with the best y) and does not depend on the
 * order of the input.
 *
 * @param points Candidate points; the array is not mutated.
 * @param direction 'higher' keeps large y, 'lower' keeps small y.
 * @returns The surviving input objects (same references), sorted by ascending x.
 */
export function paretoFrontier<T extends Point2D>(
  points: readonly T[],
  direction: ParetoDirection = 'higher',
): T[] {
  if (points.length === 0) return [];
  const preferHigherY = direction === 'higher';

  // Walk from the largest x downwards. Among points sharing an x the best y
  // comes first, so the weaker duplicates fail the "improves" check below
  // instead of slipping onto the frontier depending on input order.
  const ordered = [...points].sort((a, b) => {
    if (a.x !== b.x) return b.x - a.x;
    return preferHigherY ? b.y - a.y : a.y - b.y;
  });

  const front: T[] = [];
  let bestY = preferHigherY ? -Infinity : Infinity;
  for (const p of ordered) {
    const improves = preferHigherY ? p.y > bestY : p.y < bestY;
    if (improves) {
      front.push(p);
      bestY = p.y;
    }
  }

  // `front` was collected in strictly descending x; flip the local array so
  // downstream interpolation / AUC can treat it as an ascending grid.
  return front.reverse();
}

/**
 * Linearly interpolate y at `x` along a frontier sorted by ascending x.
 *
 * When `x` coincides with one or more vertices the stored y is returned
 * directly (no floating-point round trip); if several vertices share that x,
 * the best y for `direction` wins. Direction has no other effect.
 *
 * @param frontier Vertices sorted by ascending x.
 * @param x Query position.
 * @param direction Tie-break rule for duplicate-x vertices.
 * @returns The interpolated y, or null when the frontier is empty or `x` is
 *   NaN or outside `[minX, maxX]`.
 */
export function interpAlongFrontier(
  frontier: readonly Point2D[],
  x: number,
  direction: ParetoDirection = 'higher',
): number | null {
  if (frontier.length === 0) return null;
  const first = frontier[0];
  const last = frontier[frontier.length - 1];
  // Written as a positive range test so a NaN query is rejected too.
  if (!(x >= first.x && x <= last.x)) return null;
  if (frontier.length === 1) return first.y;

  // Binary search for the bracketing pair: frontier[lo].x <= x, and either
  // frontier[hi].x > x or hi is the last index.
  let lo = 0;
  let hi = frontier.length - 1;
  while (hi - lo > 1) {
    const mid = (lo + hi) >>> 1;
    if (frontier[mid].x <= x) lo = mid;
    else hi = mid;
  }
  const a = frontier[lo];
  const b = frontier[hi];
  const pickBest = direction === 'higher' ? Math.max : Math.min;

  if (a.x === x || b.x === x) {
    // Exact vertex hit: gather every vertex at this x. Duplicates can extend
    // backwards from `lo` and forwards from `hi`.
    let best = a.x === x ? a.y : b.y;
    for (let i = lo - 1; i >= 0 && frontier[i].x === x; i--) {
      best = pickBest(best, frontier[i].y);
    }
    for (let i = hi; i < frontier.length && frontier[i].x === x; i++) {
      best = pickBest(best, frontier[i].y);
    }
    return best;
  }

  // Defensive: a zero-width bracket can only occur on unsorted input; avoid
  // dividing by zero and fall back to the better endpoint.
  if (b.x === a.x) return pickBest(a.y, b.y);

  const t = (x - a.x) / (b.x - a.x);
  return a.y + t * (b.y - a.y);
}

/**
 * Area under the linearly-interpolated frontier over `[lo, hi]`, computed in
 * closed form (exact trapezoids between vertices, no sample grid).
 *
 * The integration range is clipped to the frontier's own x-span for both
 * directions:
 *   - 'higher': equivalent to treating y as 0 outside the span, i.e. a config
 *     that cannot reach an x contributes nothing there.
 *   - 'lower': the area covers only the reachable span; padding with 0 would
 *     reward a config for x values it cannot reach, because 0 is the best
 *     possible value of a lower-is-better metric. Use `aucWindow` to report
 *     the span that was actually integrated.
 *
 * @param frontier Vertices sorted by ascending x.
 * @param lo Requested lower integration bound.
 * @param hi Requested upper integration bound.
 * @param direction Forwarded to `interpAlongFrontier` for duplicate-x ties.
 * @returns The area, or 0 when the frontier is empty, `hi <= lo`, or the
 *   window does not overlap the frontier's x-span.
 */
export function aucUnderFrontier(
  frontier: readonly Point2D[],
  lo: number,
  hi: number,
  direction: ParetoDirection = 'higher',
): number {
  if (frontier.length === 0 || hi <= lo) return 0;
  const effLo = Math.max(lo, frontier[0].x);
  const effHi = Math.min(hi, frontier[frontier.length - 1].x);
  if (effHi <= effLo) return 0;

  // Breakpoints: the clipped bounds plus every vertex strictly between them.
  const xs: number[] = [effLo];
  for (const p of frontier) {
    if (p.x > effLo && p.x < effHi) xs.push(p.x);
  }
  xs.push(effHi);

  // Evaluate each breakpoint once, then sum the trapezoids.
  const ys = xs.map((x) => interpAlongFrontier(frontier, x, direction) ?? 0);
  let area = 0;
  for (let i = 1; i < xs.length; i++) {
    area += ((ys[i - 1] + ys[i]) / 2) * (xs[i] - xs[i - 1]);
  }
  return area;
}

/**
 * Effective AUC integration window for one frontier given a requested
 * `[lo, hi]`.
 *
 * @param frontier Vertices sorted by ascending x.
 * @param lo Requested lower bound.
 * @param hi Requested upper bound.
 * @param direction 'higher' reports the requested window unchanged (the area
 *   outside the frontier counts as zero); 'lower' reports the window clipped
 *   to the frontier's x-span so callers can label what was integrated.
 * @returns The window, or null when the frontier is empty, `hi <= lo`, or
 *   (for 'lower') the clipped window is empty.
 */
export function aucWindow(
  frontier: readonly Point2D[],
  lo: number,
  hi: number,
  direction: ParetoDirection = 'higher',
): { lo: number; hi: number } | null {
  if (frontier.length === 0 || hi <= lo) return null;
  if (direction === 'higher') return { lo, hi };
  const effLo = Math.max(lo, frontier[0].x);
  const effHi = Math.min(hi, frontier[frontier.length - 1].x);
  return effHi <= effLo ? null : { lo: effLo, hi: effHi };
}