diff --git a/packages/app/cypress/e2e/reproduce-drawer.cy.ts b/packages/app/cypress/e2e/reproduce-drawer.cy.ts new file mode 100644 index 00000000..1dc40132 --- /dev/null +++ b/packages/app/cypress/e2e/reproduce-drawer.cy.ts @@ -0,0 +1,125 @@ +/** + * Tests for the Reproduce drawer — opens from the inference table row, + * scatter pinned tooltip, and GPU graph tooltip. Verifies drawer state is + * URL-safe (closing does not perturb chart zoom or query string). + */ +describe('Reproduce drawer', () => { + beforeEach(() => { + cy.window().then((win) => { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }); + cy.visit('/inference'); + cy.get('[data-testid="scatter-graph"]') + .first() + .find('svg .dot-group') + .should('have.length.greaterThan', 0); + }); + + it('opens from clicking an inference table row and shows the three tabs', () => { + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.get('[data-testid="inference-results-table"]').should('be.visible'); + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + + cy.get('[data-testid="reproduce-drawer"]').should('be.visible'); + cy.contains('Reproduce this benchmark').should('be.visible'); + cy.contains('button', 'Command').should('be.visible'); + cy.contains('button', 'Config JSON').should('be.visible'); + cy.contains('button', 'Environment').should('be.visible'); + }); + + it('exposes a copy button on every tab', () => { + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + cy.get('[data-testid="reproduce-drawer-copy"]').should('be.visible'); + cy.contains('button', 'Config JSON').click(); + cy.get('[data-testid="reproduce-drawer-copy"]').should('be.visible'); + cy.contains('button', 'Environment').click(); + cy.get('[data-testid="reproduce-drawer-copy"]').should('be.visible'); + }); + + it('Config JSON tab shows config fields and 
excludes result metrics', () => { + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + cy.contains('button', 'Config JSON').click(); + cy.get('[data-testid="reproduce-drawer"]') + .find('pre') + .first() + .invoke('text') + .then((text) => { + // Config / identity fields belong here. + expect(text).to.match(/"framework":/u); + expect(text).to.match(/"precision":/u); + expect(text).to.match(/"tp":/u); + // Raw result metrics from `benchmark_results.metrics` must NOT leak in. + expect(text).not.to.match(/"mean_ttft":/u); + expect(text).not.to.match(/"p99_e2el":/u); + expect(text).not.to.match(/"tput_per_gpu":/u); + }); + }); + + it('Environment tab renders structured rows including env-only fields with graceful fallback', () => { + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + cy.contains('button', 'Environment').click(); + // Core rows are always rendered. The values come from /api/v1/run-environment + // when available; otherwise they show "(not recorded)" — we assert the + // labels exist either way so a regression that drops a row is caught. 
+ const labels = [ + 'GPU', + 'GPU SKU', + 'Framework', + 'Framework version', + 'Framework SHA', + 'Container image', + 'Driver', + 'CUDA', + 'PyTorch', + 'Python', + ]; + for (const label of labels) { + cy.get('[data-testid="reproduce-drawer"]').contains('dt', label).should('be.visible'); + } + }); + + it('Esc closes the drawer without changing the URL hash', () => { + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.url().then((before) => { + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + cy.get('[data-testid="reproduce-drawer"]').should('be.visible'); + cy.get('body').type('{esc}'); + cy.get('[data-testid="reproduce-drawer"]').should('not.exist'); + cy.url().should('eq', before); + }); + }); + + it('renders correctly for an unofficial-run overlay row when one is loaded', () => { + // Re-visit with the overlay query param. We do NOT assert which row is + // rendered — we only assert the drawer can be opened from whatever points + // appear for the official path on top of the overlay. The wiring is the + // same code path: clicking an inference table row feeds the InferenceData + // through to the drawer regardless of where the row originated. + const candidateRunId = '15000000000'; + cy.visit(`/inference?unofficialrun=${candidateRunId}`); + cy.get('[data-testid="scatter-graph"]') + .first() + .find('svg .dot-group') + .should('have.length.greaterThan', 0); + cy.get('[data-testid="inference-table-view-btn"]').first().click(); + cy.get('[data-testid="inference-results-table"]').should('be.visible'); + cy.get('[data-testid="inference-results-table"] tbody tr').first().click(); + cy.get('[data-testid="reproduce-drawer"]').should('be.visible'); + // Same Config JSON guarantee for the overlay path — the drawer renders + // overlay points through the same `InferenceData` shape, so result-metric + // leakage would silently regress there too if we didn't assert it. 
+ cy.contains('button', 'Config JSON').click(); + cy.get('[data-testid="reproduce-drawer"]') + .find('pre') + .first() + .invoke('text') + .then((text) => { + expect(text).to.match(/"framework":/u); + expect(text).not.to.match(/"mean_ttft":/u); + expect(text).not.to.match(/"tput_per_gpu":/u); + }); + }); +}); diff --git a/packages/app/cypress/fixtures/api/run-environment.json b/packages/app/cypress/fixtures/api/run-environment.json new file mode 100644 index 00000000..87e5ab64 --- /dev/null +++ b/packages/app/cypress/fixtures/api/run-environment.json @@ -0,0 +1,17 @@ +{ + "workflow_run_id": 1, + "config_id": 1, + "environment": { + "source": "log_parse", + "image": null, + "framework_version": null, + "framework_sha": null, + "torch_version": null, + "python_version": null, + "cuda_version": null, + "rocm_version": null, + "driver_version": null, + "gpu_sku": null, + "extra": {} + } +} diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index 0defa033..6c61dad9 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -250,6 +250,9 @@ export function createMockInferenceContext( activePresetId: null, setActivePresetId: namedStub('setActivePresetId'), presetGuardRef: { current: false } as React.RefObject, + reproducePoint: null, + openReproduceDrawer: namedStub('openReproduceDrawer'), + closeReproduceDrawer: namedStub('closeReproduceDrawer'), ...overrides, }; } diff --git a/packages/app/scripts/capture-cypress-fixtures.ts b/packages/app/scripts/capture-cypress-fixtures.ts index 5f149289..74684642 100644 --- a/packages/app/scripts/capture-cypress-fixtures.ts +++ b/packages/app/scripts/capture-cypress-fixtures.ts @@ -3,7 +3,8 @@ * * Hits the public production API by default and writes one JSON file per * endpoint into cypress/fixtures/api/. The cypress e2e suite uses these - * fixtures via cy.intercept so tests run with no database. 
+ * fixtures via server-side `FIXTURES_MODE` (E2E_FIXTURES=1) so tests run + * with no database. * * Usage: * pnpm --filter app capture:fixtures (prod) @@ -154,6 +155,11 @@ async function main() { precision: string; isl: number; osl: number; + // Optional: only present after the env-key PR ships. The capture script + // uses these to fetch a representative `/api/v1/run-environment` response; + // the route uses them as its sole identifier. + workflow_run_id?: number; + config_id?: number; } const benchmarks = await fetchJson( `/api/v1/benchmarks?model=${encodeURIComponent(BENCHMARK_MODEL)}`, @@ -188,6 +194,51 @@ async function main() { `/api/v1/workflow-info?date=${encodeURIComponent(latestDate)}`, ); + // run-environment: fired by `useRunEnvironment` every time the Reproduce + // drawer opens. We need a fixture so cypress' fixture mode doesn't 500. + // Try to pull a real one from prod, falling back to an all-nulls / + // log_parse placeholder. The placeholder is the worst-case end-state the + // drawer is designed to render (every env-only field shows "(not + // recorded)" with the "Some fields are approximated…" hint), so it's + // production-realistic even before the upstream env.json artifact lands. 
+ const RUN_ENV_PLACEHOLDER = { + workflow_run_id: 1, + config_id: 1, + environment: { + source: 'log_parse', + image: null, + framework_version: null, + framework_sha: null, + torch_version: null, + python_version: null, + cuda_version: null, + rocm_version: null, + driver_version: null, + gpu_sku: null, + extra: {}, + }, + }; + let runEnvironment: unknown = RUN_ENV_PLACEHOLDER; + const sampleRow = benchmarks.find((b) => b.workflow_run_id && b.config_id); + if (sampleRow) { + const envUrl = + `${baseUrl}/api/v1/run-environment` + + `?workflow_run_id=${sampleRow.workflow_run_id}` + + `&config_id=${sampleRow.config_id}`; + try { + const res = await fetch(envUrl); + if (res.ok) runEnvironment = await res.json(); + } catch { + // Network or parse failure — keep the placeholder; logged below. + } + } + if (runEnvironment === RUN_ENV_PLACEHOLDER) { + console.log( + ' (note) run-environment: using placeholder — either prod predates the env PR, ' + + 'the benchmark_environments table is empty, or the route is unavailable.', + ); + } + const N = TOP_DATES_PER_PARTITION; const sizes: [string, number][] = [ [ @@ -250,6 +301,7 @@ async function main() { }), ], ['workflow-info', await writeFixture('workflow-info', workflowInfo)], + ['run-environment', await writeFixture('run-environment', runEnvironment)], ]; for (const [name, bytes] of sizes) { diff --git a/packages/app/src/app/api/v1/run-environment/route.test.ts b/packages/app/src/app/api/v1/run-environment/route.test.ts new file mode 100644 index 00000000..795343f9 --- /dev/null +++ b/packages/app/src/app/api/v1/run-environment/route.test.ts @@ -0,0 +1,162 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +const { mockGetEnvironment, mockGetDb } = vi.hoisted(() => ({ + mockGetEnvironment: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); + +vi.mock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: false, +})); + 
+vi.mock('@semianalysisai/inferencex-db/queries/environments', () => ({ + getEnvironmentForRunConfig: mockGetEnvironment, +})); + +vi.mock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), +})); + +import { GET } from './route'; +import { NextRequest } from 'next/server'; + +function req(url: string): NextRequest { + return new NextRequest(new URL(url, 'http://localhost')); +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +const env = { + source: 'env_json' as const, + image: 'lmsysorg/sglang:latest', + framework_version: '0.4.3.post2', + framework_sha: 'e136d70cdc6101007017c05d57fb4cec5d6ed98f', + torch_version: '2.5.1+cu124', + python_version: '3.12.7', + cuda_version: '12.4', + rocm_version: null, + driver_version: '560.35.03', + gpu_sku: 'NVIDIA H100 80GB HBM3', + extra: {}, +}; + +const VALID_QS = 'workflow_run_id=101&config_id=42'; + +describe('GET /api/v1/run-environment', () => { + it('returns 400 when workflow_run_id is missing', async () => { + const res = await GET(req('/api/v1/run-environment?config_id=42')); + expect(res.status).toBe(400); + }); + + it('returns 400 when config_id is missing', async () => { + const res = await GET(req('/api/v1/run-environment?workflow_run_id=101')); + expect(res.status).toBe(400); + }); + + it('returns 400 when params are non-numeric', async () => { + const res = await GET(req('/api/v1/run-environment?workflow_run_id=abc&config_id=xyz')); + expect(res.status).toBe(400); + }); + + it('returns 404 when no environment row exists', async () => { + mockGetEnvironment.mockResolvedValueOnce(null); + const res = await GET(req(`/api/v1/run-environment?${VALID_QS}`)); + expect(res.status).toBe(404); + }); + + it('returns env_json environment for valid (workflow_run_id, config_id)', async () => { + mockGetEnvironment.mockResolvedValueOnce(env); + const res = await GET(req(`/api/v1/run-environment?${VALID_QS}`)); + expect(res.status).toBe(200); 
+ const body = await res.json(); + expect(body).toEqual({ workflow_run_id: 101, config_id: 42, environment: env }); + expect(mockGetEnvironment).toHaveBeenCalledWith('mock-sql', 101, 42); + }); + + it('returns log_parse environment with nulls preserved', async () => { + mockGetEnvironment.mockResolvedValueOnce({ + ...env, + source: 'log_parse', + framework_sha: null, + driver_version: null, + cuda_version: null, + gpu_sku: null, + }); + const res = await GET(req(`/api/v1/run-environment?${VALID_QS}`)); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.environment.source).toBe('log_parse'); + expect(body.environment.framework_sha).toBeNull(); + expect(body.environment.driver_version).toBeNull(); + }); + + it('returns 500 when query throws', async () => { + mockGetEnvironment.mockRejectedValueOnce(new Error('Connection reset')); + const res = await GET(req(`/api/v1/run-environment?${VALID_QS}`)); + expect(res.status).toBe(500); + }); +}); + +// Separate suite because FIXTURES_MODE is read at module-eval time — the only +// way to flip it for a single test is to reset module cache + dynamic-import. 
+describe('GET /api/v1/run-environment (FIXTURES_MODE)', () => { + it('short-circuits to the loaded fixture and never hits the env query', async () => { + vi.resetModules(); + const mockLoadFixture = vi.fn(() => ({ + workflow_run_id: 1, + config_id: 1, + environment: { ...env, source: 'log_parse' }, + })); + vi.doMock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: true, + })); + vi.doMock('@semianalysisai/inferencex-db/queries/environments', () => ({ + getEnvironmentForRunConfig: mockGetEnvironment, + })); + vi.doMock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), + })); + vi.doMock('@/lib/test-fixtures', () => ({ loadFixture: mockLoadFixture })); + + const { GET: GETwithFixtures } = await import('./route'); + const res = await GETwithFixtures(req(`/api/v1/run-environment?${VALID_QS}`)); + + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.environment.source).toBe('log_parse'); + expect(mockLoadFixture).toHaveBeenCalledWith('run-environment'); + expect(mockGetEnvironment).not.toHaveBeenCalled(); + }); + + it('still 400s on missing params before consulting the fixture', async () => { + vi.resetModules(); + const mockLoadFixture = vi.fn(); + vi.doMock('@semianalysisai/inferencex-db/connection', () => ({ + getDb: mockGetDb, + JSON_MODE: false, + FIXTURES_MODE: true, + })); + vi.doMock('@semianalysisai/inferencex-db/queries/environments', () => ({ + getEnvironmentForRunConfig: mockGetEnvironment, + })); + vi.doMock('@/lib/api-cache', () => ({ + cachedQuery: (fn: (...args: any[]) => any) => fn, + cachedJson: (data: unknown) => Response.json(data), + })); + vi.doMock('@/lib/test-fixtures', () => ({ loadFixture: mockLoadFixture })); + + const { GET: GETwithFixtures } = await import('./route'); + const res = await GETwithFixtures(req('/api/v1/run-environment?config_id=42')); + + 
expect(res.status).toBe(400); + expect(mockLoadFixture).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/app/src/app/api/v1/run-environment/route.ts b/packages/app/src/app/api/v1/run-environment/route.ts new file mode 100644 index 00000000..029cb873 --- /dev/null +++ b/packages/app/src/app/api/v1/run-environment/route.ts @@ -0,0 +1,63 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; +import { getEnvironmentForRunConfig } from '@semianalysisai/inferencex-db/queries/environments'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; +import { loadFixture } from '@/lib/test-fixtures'; + +export const dynamic = 'force-dynamic'; + +const getCachedEnvironment = cachedQuery( + (workflowRunId: number, configId: number) => { + if (JSON_MODE) { + return Promise.resolve(jsonProvider.getEnvironmentForRunConfig(workflowRunId, configId)); + } + return getEnvironmentForRunConfig(getDb(), workflowRunId, configId); + }, + 'run-environment', + { blobOnly: true }, +); + +/** + * GET /api/v1/run-environment?workflow_run_id=N&config_id=M + * + * Responds 400 unless both ids parse as positive integers. In FIXTURES_MODE + * the `run-environment` fixture is then returned without running the lookup. + * Otherwise: 404 when the lookup yields null, 500 (logged) when it throws, + * and on success the environment wrapped together with the two ids. + */ +export async function GET(request: NextRequest) { + const params = request.nextUrl.searchParams; + const workflowRunId = Number(params.get('workflow_run_id')); + const configId = Number(params.get('config_id')); + + // `Number(null)` is 0 and `Number('abc')` is NaN, so a missing or + // non-numeric param fails the same check as a negative or fractional one. + if (!Number.isInteger(workflowRunId) || workflowRunId <= 0) { + return NextResponse.json( + { error: 'workflow_run_id is required (positive integer)' }, + { status: 400 }, + ); + } + if (!Number.isInteger(configId) || configId <= 0) { + return NextResponse.json( + { error: 'config_id is required (positive integer)' }, + { status: 400 }, + ); + } + + if (FIXTURES_MODE) return cachedJson(loadFixture('run-environment')); + + try { + const env = await getCachedEnvironment(workflowRunId, configId); + if (env === null) { + return NextResponse.json({ error: 'Not found' }, { status: 404 }); + } + return cachedJson({
workflow_run_id: workflowRunId, config_id: configId, environment: env }); + } catch (error) { + console.error('Error fetching benchmark environment:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 3011fc24..aaef9a6d 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -146,6 +146,23 @@ export function InferenceProvider({ // --- Tracked configs state --- const [trackedConfigs, setTrackedConfigs] = useState([]); + // --- Reproduce drawer state --- + // Local-only — we do NOT sync this to the URL because closing the drawer + // should not perturb chart zoom or share-link state. + const [reproducePoint, setReproducePoint] = useState(null); + const openReproduceDrawer = useCallback((point: InferenceData, source: string) => { + setReproducePoint(point); + track('reproduce_drawer_open_clicked', { + source, + framework: point.framework, + hwKey: point.hwKey, + precision: point.precision, + tp: point.tp, + conc: point.conc, + }); + }, []); + const closeReproduceDrawer = useCallback(() => setReproducePoint(null), []); + // --- Favorite presets state --- const [pendingHwFilter, setPendingHwFilter] = useState(null); const [activePresetId, setActivePresetId] = useState(null); @@ -984,6 +1001,9 @@ export function InferenceProvider({ activePresetId, setActivePresetId, presetGuardRef, + reproducePoint, + openReproduceDrawer, + closeReproduceDrawer, }), [ activeHwTypes, @@ -1037,6 +1057,9 @@ export function InferenceProvider({ removeTrackedConfig, clearTrackedConfigs, activePresetId, + reproducePoint, + openReproduceDrawer, + closeReproduceDrawer, ], ); diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a0e9232d..3ea703ae 100644 --- 
a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -88,6 +88,14 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** + * Natural-key halves for the Reproduce Drawer's Environment tab — together + * they key `/api/v1/run-environment`. Both are optional because client-only + * synthetic points (e.g. overlay rooflines, unofficial-run rows) don't + * originate from a DB row. + */ + workflowRunId?: number; + configId?: number; } /** @@ -546,6 +554,12 @@ export interface InferenceChartContextType { activePresetId: string | null; setActivePresetId: (id: string | null) => void; presetGuardRef: React.RefObject; + /** The point currently shown in the Reproduce drawer, or null when closed. */ + reproducePoint: InferenceData | null; + /** Open the Reproduce drawer for a given chart point. */ + openReproduceDrawer: (point: InferenceData, source: string) => void; + /** Close the Reproduce drawer. 
*/ + closeReproduceDrawer: () => void; } export interface CalculateUserCostsRequest { model: string; diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 68f46809..13d0451e 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -49,8 +49,11 @@ import CustomCosts from './CustomCosts'; import CustomPowers from './CustomPowers'; import GPUGraph from './GPUGraph'; import ReplayLauncher, { type ReplayLauncherHandle } from '../replay/ReplayLauncher'; +import ReproduceDrawer from './ReproduceDrawer'; import TrendChart from './TrendChart'; +import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; + const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagram'), { ssr: false, loading: () => , @@ -150,8 +153,15 @@ export default function ChartDisplay() { activeHwTypes, activeDates, setSelectedE2eXAxisMetric, + reproducePoint, + closeReproduceDrawer, } = useInference(); + const reproduceSequence = useMemo( + () => (selectedSequence ? sequenceToIslOsl(selectedSequence) : null) ?? 
undefined, + [selectedSequence], + ); + const { changelogs, loading: changelogsLoading, @@ -707,6 +717,13 @@ export default function ChartDisplay() { + + ); } diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index 407e7256..f4ceca57 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -72,6 +72,7 @@ const GPUGraph = React.memo( selectAllActiveDates, showLineLabels, setShowLineLabels, + openReproduceDrawer, } = useInference(); const { resolvedTheme } = useTheme(); const chartRef = useRef(null); @@ -691,6 +692,18 @@ const GPUGraph = React.memo( sel.select('.visible-shape') as any, getShapeKeyForPrecision(d.precision, selectedPrecisions), ), + onPointClick: (d: InferenceData) => { + const tooltipEl = chartRef.current?.getTooltipElement(); + if (!tooltipEl) return; + const reproduceBtn = tooltipEl.querySelector('[data-action="reproduce"]'); + if (!reproduceBtn) return; + reproduceBtn.addEventListener('click', (btnEvent) => { + btnEvent.stopPropagation(); + openReproduceDrawer(d, 'gpu_graph_tooltip'); + chartRef.current?.dismissTooltip(); + chartRef.current?.hideTooltip(); + }); + }, attachToLayer: 1, }} onRender={(ctx: RenderContext) => { diff --git a/packages/app/src/components/inference/ui/InferenceTable.tsx b/packages/app/src/components/inference/ui/InferenceTable.tsx index c300e60d..3f01b70e 100644 --- a/packages/app/src/components/inference/ui/InferenceTable.tsx +++ b/packages/app/src/components/inference/ui/InferenceTable.tsx @@ -2,8 +2,10 @@ import { useMemo } from 'react'; +import { useInference } from '@/components/inference/InferenceContext'; import type { ChartDefinition, InferenceData } from '@/components/inference/types'; import { type DataTableColumn, DataTable } from '@/components/ui/data-table'; +import { track } from '@/lib/analytics'; import { getHardwareConfig } from '@/lib/constants'; import { 
getNestedYValue } from '@/lib/chart-utils'; import { type Precision, getPrecisionLabel } from '@/lib/data-mappings'; @@ -29,6 +31,7 @@ export default function InferenceTable({ chartDefinition, selectedYAxisMetric, }: InferenceTableProps) { + const { openReproduceDrawer } = useInference(); const yPath = chartDefinition[selectedYAxisMetric as keyof ChartDefinition] as string | undefined; const yLabel = chartDefinition[`${selectedYAxisMetric}_label` as keyof ChartDefinition] as string; const xLabel = chartDefinition.x_label; @@ -120,6 +123,16 @@ export default function InferenceTable({ columns={columns} testId="inference-results-table" analyticsPrefix="inference_table" + onRowClick={(row) => { + track('inference_table_reproduce_clicked', { + framework: row.framework, + hwKey: row.hwKey, + precision: row.precision, + tp: row.tp, + conc: row.conc, + }); + openReproduceDrawer(row, 'inference_table'); + }} /> ); } diff --git a/packages/app/src/components/inference/ui/ReproduceDrawer.tsx b/packages/app/src/components/inference/ui/ReproduceDrawer.tsx new file mode 100644 index 00000000..9d6e6f99 --- /dev/null +++ b/packages/app/src/components/inference/ui/ReproduceDrawer.tsx @@ -0,0 +1,428 @@ +'use client'; + +import { useEffect, useMemo, useState } from 'react'; +import { Check, Copy, ExternalLink } from 'lucide-react'; + +import type { InferenceData } from '@/components/inference/types'; +import { Dialog, DialogContent, DialogTitle } from '@/components/ui/dialog'; +import { useRunEnvironment } from '@/hooks/api/use-run-environment'; +import type { BenchmarkEnvironment } from '@/lib/api'; +import { track } from '@/lib/analytics'; +import { getHardwareConfig } from '@/lib/constants'; +import { buildLaunchCommand } from '@/lib/reproduce-command'; +import { buildReproduceConfig } from '@/lib/reproduce-config'; +import { getDisplayLabel, updateRepoUrl } from '@/lib/utils'; + +type DrawerTab = 'command' | 'config' | 'environment'; + +interface ReproduceDrawerProps { + /** The 
point to reproduce, or null when the drawer is closed. */ + point: InferenceData | null; + /** ISL/OSL of the active sequence, used for command-line generation. */ + sequence?: { isl: number; osl: number }; + /** Selected model display key, passed through to launch-command generation. */ + model?: string; + onClose: () => void; +} + +/** + * Drawer that explains how to reproduce a benchmark point: launch command, + * full config JSON, environment (image, framework SHA, GPU SKU, run URL). + * + * Exits on Esc and outside-click without disturbing chart zoom or URL state — + * the only state that lives outside this component is the selected `point`, + * which the caller wipes via `onClose`. + */ +export default function ReproduceDrawer({ point, sequence, model, onClose }: ReproduceDrawerProps) { + const open = point !== null; + const [activeTab, setActiveTab] = useState('command'); + + useEffect(() => { + if (!open) return; + setActiveTab('command'); + }, [point?.hwKey, point?.tp, point?.conc, point?.precision, open]); + + const launch = useMemo(() => { + if (!point) return null; + return buildLaunchCommand(point.framework ?? '', { + model, + precision: point.precision, + tp: point.tp, + ep: point.ep, + dp_attention: point.dp_attention, + spec_decoding: point.spec_decoding, + disagg: point.disagg, + prefill_tp: point.prefill_tp, + prefill_ep: point.prefill_ep, + prefill_dp_attention: point.prefill_dp_attention, + prefill_num_workers: point.prefill_num_workers, + num_prefill_gpu: point.num_prefill_gpu, + decode_tp: point.decode_tp, + decode_ep: point.decode_ep, + decode_dp_attention: point.decode_dp_attention, + decode_num_workers: point.decode_num_workers, + num_decode_gpu: point.num_decode_gpu, + conc: point.conc, + isl: sequence?.isl, + osl: sequence?.osl, + image: point.image, + }); + }, [point, sequence?.isl, sequence?.osl, model]); + + const configJson = useMemo( + () => (point ? 
JSON.stringify(buildReproduceConfig(point, sequence), null, 2) : ''), + [point, sequence], + ); + + const runUrl = point?.run_url ? updateRepoUrl(point.run_url) : undefined; + const hwLabel = useMemo(() => { + if (!point) return ''; + try { + return getDisplayLabel(getHardwareConfig(point.hwKey)); + } catch { + return point.hwKey; + } + }, [point]); + + // Fire drawer-open analytics once per opened point. + useEffect(() => { + if (!point) return; + track('reproduce_drawer_opened', { + framework: point.framework, + hwKey: point.hwKey, + precision: point.precision, + tp: point.tp, + conc: point.conc, + disagg: Boolean(point.disagg), + }); + }, [point]); + + // Fetch authoritative env metadata (driver / CUDA / framework SHA / etc.) + // for this benchmark row, keyed by (workflow_run_id, config_id) — the + // natural key of `benchmark_environments`. The hook is a no-op when + // either id is missing (e.g. synthetic overlay points), and the UI + // degrades to the point-derived fields below. + const envQuery = useRunEnvironment(point?.workflowRunId, point?.configId); + const env = envQuery.data?.environment; + + const copyTextForActiveTab = (): string => { + if (!point) return ''; + if (activeTab === 'config') return configJson; + if (activeTab === 'environment') { + return buildEnvironmentCopyText({ point, hwLabel, runUrl, env }); + } + if (!launch) return ''; + if (launch.kind === 'single' && launch.command) return launch.command; + if (launch.kind === 'disagg' && launch.commands) { + return launch.commands.map((c) => `# ${c.label}\n${c.command}`).join('\n\n'); + } + return launch.fallbackReason ?? ''; + }; + + const handleCopy = async () => { + if (!point) return; + const text = copyTextForActiveTab(); + if (!text) return; + try { + await navigator.clipboard.writeText(text); + } catch { + // Clipboard can be unavailable in non-secure contexts; tracking still useful. 
+ } + track('reproduce_copy', { + tab: activeTab, + framework: point.framework, + hwKey: point.hwKey, + precision: point.precision, + tp: point.tp, + conc: point.conc, + }); + }; + + return ( + { + if (!o) onClose(); + }} + > + +
+
+ Reproduce this benchmark + {point && ( +
+ {hwLabel} + {' · '} + TP{point.tp} + {' · '} + conc {point.conc} + {' · '} + {point.precision} + {point.disagg && · disagg} +
+ )} +
+
+ +
+ setActiveTab('command')} + /> + setActiveTab('config')} + /> + setActiveTab('environment')} + /> + +
+ +
+ {point ? ( + activeTab === 'command' ? ( + + ) : activeTab === 'config' ? ( + + ) : ( + + ) + ) : null} +
+
+
+ ); +} + +interface TabButtonProps { + label: string; + active: boolean; + onClick: () => void; +} + +function TabButton({ label, active, onClick }: TabButtonProps) { + return ( + + ); +} + +function CopyButton({ onCopy, testId }: { onCopy: () => void | Promise; testId?: string }) { + const [copied, setCopied] = useState(false); + return ( + + ); +} + +function CommandTab({ launch }: { launch: ReturnType | null }) { + if (!launch) return null; + if (launch.kind === 'fallback') { + return ( +
+

No launch command

+

{launch.fallbackReason}

+
+ ); + } + if (launch.kind === 'single' && launch.command) { + return ; + } + if (launch.kind === 'disagg' && launch.commands) { + return ( +
+ {launch.commands.map((cmd) => ( +
+
+ {cmd.label} +
+ +
+ ))} +
+ ); + } + return null; +} + +/** Pure helper — undefined ⟶ null so the render-time fallback handles both. */ +const fromEnv = (v: string | null | undefined) => v ?? null; + +/** + * Build the labeled rows for the Environment tab. Centralized so the + * rendered UI and the copy-to-clipboard output stay in sync — adding a new + * field means changing one place. + * + * `env` is the authoritative response from `/api/v1/run-environment`. When + * absent (loading, 404, or synthetic overlay point) the rows fall back to + * what we can derive from `point` alone. Rows whose value is `null` or + * empty render as "(not recorded)" and are left out of the copied text. + */ +function buildEnvironmentRows( + point: InferenceData, + hwLabel: string, + runUrl: string | undefined, + env: BenchmarkEnvironment | undefined, +): { label: string; value: string | null }[] { + return [ + { label: 'GPU', value: hwLabel || null }, + { label: 'GPU SKU', value: fromEnv(env?.gpu_sku) }, + { label: 'Framework', value: point.framework ?? null }, + { label: 'Framework version', value: fromEnv(env?.framework_version) }, + { label: 'Framework SHA', value: fromEnv(env?.framework_sha) }, + { + label: 'Precision', + value: point.precision ? point.precision.toUpperCase() : null, + }, + { + label: 'Speculative decoding', + value: point.spec_decoding && point.spec_decoding !== 'none' ? point.spec_decoding : 'none', + }, + { label: 'Container image', value: env?.image ?? point.image ?? null }, + { label: 'Driver', value: fromEnv(env?.driver_version) }, + { label: 'CUDA', value: fromEnv(env?.cuda_version) }, + { label: 'ROCm', value: fromEnv(env?.rocm_version) }, + { label: 'PyTorch', value: fromEnv(env?.torch_version) }, + { label: 'Python', value: fromEnv(env?.python_version) }, + { label: 'Run date', value: point.actualDate ?? point.date ?? null }, + { label: 'Workflow run', value: runUrl ??
null }, + ]; +} + +function buildEnvironmentCopyText(args: { + point: InferenceData; + hwLabel: string; + runUrl: string | undefined; + env: BenchmarkEnvironment | undefined; +}): string { + return buildEnvironmentRows(args.point, args.hwLabel, args.runUrl, args.env) + .filter((r) => Boolean(r.value)) + .map((r) => `${r.label}: ${r.value}`) + .join('\n'); +} + +function EnvironmentTab({ + point, + hwLabel, + runUrl, + env, + isLoading, +}: { + point: InferenceData; + hwLabel: string; + runUrl?: string; + env: BenchmarkEnvironment | undefined; + isLoading: boolean; +}) { + const rows = buildEnvironmentRows(point, hwLabel, runUrl, env); + return ( +
+ {env?.source === 'log_parse' && ( +

+ Some fields are approximated from the server log; consult the run URL for the + authoritative environment. +

+ )} + {isLoading && !env ? ( +

Loading environment…

+ ) : null} +
+ {rows.map(({ label, value }) => ( +
+
{label}
+
+ {value || (not recorded)} +
+
+ ))} +
+
+ ); +} + +function CodeBlock({ value, language: _language }: { value: string; language: 'bash' | 'json' }) { + return ( +
+      {value}
+    
+ ); +} diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index f9a73aa8..15bb7a08 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -150,6 +150,7 @@ const ScatterGraph = React.memo( trackedConfigs, addTrackedConfig, removeTrackedConfig, + openReproduceDrawer, } = useInference(); const { @@ -679,6 +680,15 @@ const ScatterGraph = React.memo( }); }); } + const reproduceBtn = tooltipEl.querySelector('[data-action="reproduce"]'); + if (reproduceBtn) { + reproduceBtn.addEventListener('click', (btnEvent) => { + btnEvent.stopPropagation(); + openReproduceDrawer(d, 'scatter_tooltip'); + chartRef.current?.dismissTooltip(); + chartRef.current?.hideTooltip(); + }); + } } }, attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0) @@ -693,6 +703,7 @@ const ScatterGraph = React.memo( removeTrackedConfig, chartDefinition.chartType, selectedPrecisions, + openReproduceDrawer, ], ); diff --git a/packages/app/src/components/inference/utils/tooltip-utils.test.ts b/packages/app/src/components/inference/utils/tooltip-utils.test.ts index bb8caafa..596a6ba4 100644 --- a/packages/app/src/components/inference/utils/tooltip-utils.test.ts +++ b/packages/app/src/components/inference/utils/tooltip-utils.test.ts @@ -264,6 +264,17 @@ describe('generateTooltipContent', () => { expect(html).toContain('Track Over Time'); expect(html).not.toContain('Untrack Over Time'); }); + + it('shows the Reproduce button when pinned', () => { + const html = generateTooltipContent(tooltipConfig({ isPinned: true })); + expect(html).toContain('data-action="reproduce"'); + expect(html).toContain('Reproduce'); + }); + + it('does not show the Reproduce button when not pinned', () => { + const html = generateTooltipContent(tooltipConfig({ isPinned: false })); + expect(html).not.toContain('data-action="reproduce"'); + }); }); // 
=========================================================================== @@ -365,4 +376,14 @@ describe('generateGPUGraphTooltipContent', () => { ); expect(html).toContain('vllm-v0.6.0
abc123'); }); + + it('shows the Reproduce button when pinned', () => { + const html = generateGPUGraphTooltipContent(tooltipConfig({ isPinned: true })); + expect(html).toContain('data-action="reproduce"'); + }); + + it('does not show the Reproduce button when not pinned', () => { + const html = generateGPUGraphTooltipContent(tooltipConfig({ isPinned: false })); + expect(html).not.toContain('data-action="reproduce"'); + }); }); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 4c56d217..6ba23b8f 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -192,7 +192,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => { margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500; border: 1px solid var(--border); border-radius: 6px; cursor: pointer; background: var(--accent); color: var(--accent-foreground); - ">${config.isTracked ? 'Untrack Over Time' : 'Track Over Time'}` + ">${config.isTracked ? 'Untrack Over Time' : 'Track Over Time'} + ` : '' } @@ -301,6 +306,15 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string => Precision: ${d.precision.toUpperCase()} ${runLinkHTML(runUrl)} + ${ + isPinned + ? `` + : '' + } `; }; diff --git a/packages/app/src/components/ui/data-table.tsx b/packages/app/src/components/ui/data-table.tsx index e69135a4..05dbe164 100644 --- a/packages/app/src/components/ui/data-table.tsx +++ b/packages/app/src/components/ui/data-table.tsx @@ -51,6 +51,11 @@ interface DataTableProps { analyticsPrefix?: string; /** Show watermark (default: true). */ watermark?: boolean; + /** + * When set, the entire row is clickable and keyboard-activatable (Enter / + * Space), e.g. to open a detail drawer. 
+ */ + onRowClick?: (row: T, index: number) => void; } const PAGE_SIZE_OPTIONS = [25, 50, 100, 250, 500] as const; @@ -73,6 +78,7 @@ export function DataTable({ testId = 'data-table', analyticsPrefix = 'table', watermark = true, + onRowClick, }: DataTableProps) { const [page, setPage] = useState(0); const [pageSize, setPageSize] = useState(25); @@ -236,7 +242,28 @@ export function DataTable({ ) : ( pageData.map((row, rowIndex) => ( - + onRowClick(row, safePage * pageSize + rowIndex) : undefined + } + onKeyDown={ + onRowClick + ? (e) => { + if (e.key === 'Enter' || e.key === ' ') { + e.preventDefault(); + onRowClick(row, safePage * pageSize + rowIndex); + } + } + : undefined + } + > {columns.map((col, colIndex) => ( ({ + queryKey: ['run-environment', workflowRunId, configId] as const, + queryFn: ({ signal }) => + fetchRunEnvironment(workflowRunId as number, configId as number, signal), + enabled, + // Env data for a given (run, config) never changes once written — + // long stale time avoids refetches on drawer reopen. + staleTime: 60 * 60 * 1000, + }); +} diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 999cbfde..57cdcf62 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -6,6 +6,14 @@ import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { + /** + * Natural-key halves identifying the (run, config) this row was produced + * by. Together they key `benchmark_environments` and any future + * per-(run, config) endpoint. Optional because unofficial-run rows + * (synthesized client-side from GHA artifacts) have no DB row. 
+ */ + workflow_run_id?: number; + config_id?: number; hardware: string; framework: string; model: string; @@ -271,3 +279,33 @@ export interface FeedbackListRow { export function fetchFeedbackList(signal?: AbortSignal) { return fetchJson<{ rows: FeedbackListRow[] }>('/api/v1/feedback/list', signal); } + +/** Environment metadata for a single benchmark row, served by /api/v1/run-environment. */ +export interface BenchmarkEnvironment { + /** Provenance: 'env_json' = authoritative, 'log_parse' = best-effort fallback. */ + source: 'env_json' | 'log_parse'; + image: string | null; + framework_version: string | null; + framework_sha: string | null; + torch_version: string | null; + python_version: string | null; + cuda_version: string | null; + rocm_version: string | null; + driver_version: string | null; + gpu_sku: string | null; + extra: Record; +} + +export interface RunEnvironmentResponse { + workflow_run_id: number; + config_id: number; + environment: BenchmarkEnvironment; +} + +export function fetchRunEnvironment(workflowRunId: number, configId: number, signal?: AbortSignal) { + const params = new URLSearchParams({ + workflow_run_id: String(workflowRunId), + config_id: String(configId), + }); + return fetchJson(`/api/v1/run-environment?${params}`, signal); +} diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index be76438e..00fed7b3 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -6,6 +6,8 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform function makeRow(overrides: Partial = {}): BenchmarkRow { return { + workflow_run_id: 1, + config_id: 1, hardware: 'h200', framework: 'trt', model: 'dsr1', diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 107f0b12..505a644b 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ 
b/packages/app/src/lib/benchmark-transform.ts @@ -68,6 +68,8 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? undefined, + workflowRunId: row.workflow_run_id, + configId: row.config_id, }; } diff --git a/packages/app/src/lib/reproduce-command.test.ts b/packages/app/src/lib/reproduce-command.test.ts new file mode 100644 index 00000000..910a34bb --- /dev/null +++ b/packages/app/src/lib/reproduce-command.test.ts @@ -0,0 +1,243 @@ +import { describe, expect, it } from 'vitest'; + +import { buildLaunchCommand } from './reproduce-command'; + +describe('buildLaunchCommand', () => { + describe('vllm', () => { + it('builds a basic single-process command', () => { + const result = buildLaunchCommand('vllm', { + model: 'deepseek-ai/DeepSeek-R1', + precision: 'fp8', + tp: 8, + conc: 64, + isl: 1024, + osl: 1024, + }); + expect(result.kind).toBe('single'); + expect(result.framework).toBe('vllm'); + expect(result.command).toContain('vllm serve'); + expect(result.command).toContain('--tensor-parallel-size 8'); + expect(result.command).toContain('--dtype fp8'); + expect(result.command).toContain('--max-num-seqs 64'); + expect(result.command).toContain('--max-model-len 2048'); + // Properly quotes the model when it contains a / + expect(result.command).toContain('deepseek-ai/DeepSeek-R1'); + }); + + it('emits expert-parallel and dp-attention flags when requested', () => { + const result = buildLaunchCommand('vllm', { + model: 'm', + precision: 'fp4', + tp: 8, + ep: 8, + dp_attention: true, + }); + expect(result.command).toContain('--expert-parallel-size 8'); + expect(result.command).toContain('--data-parallel-attention'); + }); + + it('emits a JSON speculative-config when spec_decoding is set', () => { + const result = buildLaunchCommand('vllm', { + model: 'm', + precision: 'fp8', + tp: 4, + spec_decoding: 'mtp', + }); + // JSON gets shell-quoted because of the curly 
braces / quotes. + expect(result.command).toMatch(/--speculative-config '\{"method":"mtp"\}'/u); + }); + + it('omits speculative-config when spec_decoding is "none"', () => { + const result = buildLaunchCommand('vllm', { + model: 'm', + precision: 'fp8', + tp: 1, + spec_decoding: 'none', + }); + expect(result.command).not.toContain('speculative-config'); + }); + }); + + describe('sglang', () => { + it('builds a basic single-process command', () => { + const result = buildLaunchCommand('sglang', { + model: 'meta-llama/Llama-3.3-70B', + precision: 'fp8', + tp: 4, + conc: 32, + isl: 8192, + osl: 1024, + }); + expect(result.kind).toBe('single'); + expect(result.command).toContain('python -m sglang.launch_server'); + expect(result.command).toContain('--tp 4'); + expect(result.command).toContain('--max-running-requests 32'); + expect(result.command).toContain('--context-length 9216'); + }); + + it('uses --enable-dp-attention for sglang', () => { + const result = buildLaunchCommand('sglang', { + model: 'm', + precision: 'fp8', + tp: 8, + dp_attention: true, + }); + expect(result.command).toContain('--enable-dp-attention'); + }); + + it('emits --speculative-algorithm when spec_decoding is set', () => { + const result = buildLaunchCommand('sglang', { + model: 'm', + precision: 'fp8', + tp: 4, + spec_decoding: 'eagle3', + }); + expect(result.command).toContain('--speculative-algorithm EAGLE3'); + }); + }); + + describe('trt / trtllm alias', () => { + it('treats trtllm as an alias for trt', () => { + const result = buildLaunchCommand('trtllm', { + model: 'm', + precision: 'fp4', + tp: 8, + }); + expect(result.framework).toBe('trt'); + expect(result.command).toContain('trtllm-serve'); + expect(result.command).toContain('--tp_size 8'); + }); + + it('builds a basic trt command', () => { + const result = buildLaunchCommand('trt', { + model: 'm', + precision: 'fp4', + tp: 4, + ep: 4, + conc: 16, + isl: 1024, + osl: 256, + }); + expect(result.command).toContain('--backend 
pytorch'); + expect(result.command).toContain('--tp_size 4'); + expect(result.command).toContain('--ep_size 4'); + expect(result.command).toContain('--max_batch_size 16'); + expect(result.command).toContain('--max_seq_len 1280'); + expect(result.command).toContain('--kv_cache_dtype fp4'); + }); + + it('emits --speculative_config={"decoding_type":...} for spec', () => { + const result = buildLaunchCommand('trt', { + model: 'm', + precision: 'fp4', + tp: 1, + spec_decoding: 'mtp', + }); + // The flag is a single token because --speculative_config=... has no + // space separator; quoteIfNeeded passes `--`-prefixed tokens through unquoted. + expect(result.command).toMatch(/--speculative_config=\{"decoding_type":"MTP"\}/u); + }); + }); + + describe('disagg', () => { + it('returns two commands for vllm disagg with separate prefill / decode TPs', () => { + const result = buildLaunchCommand('vllm', { + model: 'm', + precision: 'fp8', + tp: 8, + disagg: true, + prefill_tp: 4, + prefill_num_workers: 2, + num_prefill_gpu: 8, + decode_tp: 16, + decode_num_workers: 1, + num_decode_gpu: 16, + }); + expect(result.kind).toBe('disagg'); + expect(result.commands).toHaveLength(2); + expect(result.commands?.[0].label).toContain('Prefill'); + expect(result.commands?.[0].command).toContain('--tensor-parallel-size 4'); + expect(result.commands?.[0].command).toContain('--disagg-role prefill'); + expect(result.commands?.[1].label).toContain('Decode'); + expect(result.commands?.[1].command).toContain('--tensor-parallel-size 16'); + expect(result.commands?.[1].command).toContain('--disagg-role decode'); + }); + + it('uses --disaggregate_role for trt disagg', () => { + const result = buildLaunchCommand('trt', { + model: 'm', + precision: 'fp4', + tp: 8, + disagg: true, + prefill_tp: 4, + decode_tp: 8, + }); + expect(result.kind).toBe('disagg'); + expect(result.commands?.[0].command).toContain('--disaggregate_role prefill'); + expect(result.commands?.[1].command).toContain('--disaggregate_role 
decode'); + }); + + it('falls back to top-level tp when prefill_tp/decode_tp missing', () => { + const result = buildLaunchCommand('sglang', { + model: 'm', + precision: 'fp8', + tp: 4, + disagg: true, + }); + expect(result.kind).toBe('disagg'); + expect(result.commands?.[0].command).toContain('--tp 4'); + expect(result.commands?.[1].command).toContain('--tp 4'); + }); + }); + + describe('compound / orchestrator frameworks → fallback', () => { + const compounds = [ + ['atom', /ATOM/u], + ['mori-sglang', /MoRI/u], + ['dynamo-vllm', /Dynamo vLLM/u], + ['dynamo-trt', /Dynamo TRT/u], + ['dynamo-sglang', /Dynamo SGLang/u], + ] as const; + + it.each(compounds)('returns kind="fallback" for %s', (fw, msgRe) => { + const result = buildLaunchCommand(fw, { + model: 'm', + precision: 'fp8', + tp: 8, + }); + expect(result.kind).toBe('fallback'); + expect(result.framework).toBe(fw); + expect(result.fallbackReason).toMatch(msgRe); + }); + + it('resolves the dynamo-trtllm alias before deciding fallback', () => { + const result = buildLaunchCommand('dynamo-trtllm', { + model: 'm', + precision: 'fp8', + tp: 8, + }); + expect(result.kind).toBe('fallback'); + expect(result.framework).toBe('dynamo-trt'); + }); + }); + + describe('unknown framework', () => { + it('returns a fallback with a clear reason for unknown frameworks', () => { + const result = buildLaunchCommand('made-up-framework', { + model: 'm', + precision: 'fp8', + tp: 1, + }); + expect(result.kind).toBe('fallback'); + expect(result.fallbackReason).toContain('made-up-framework'); + }); + }); + + describe('placeholders for missing fields', () => { + it('uses and placeholders when omitted', () => { + const result = buildLaunchCommand('vllm', { tp: 1 }); + expect(result.command).toContain(''); + expect(result.command).toContain(''); + }); + }); +}); diff --git a/packages/app/src/lib/reproduce-command.ts b/packages/app/src/lib/reproduce-command.ts new file mode 100644 index 00000000..8ff3440b --- /dev/null +++ 
b/packages/app/src/lib/reproduce-command.ts @@ -0,0 +1,255 @@ +import { resolveFrameworkAlias } from '@semianalysisai/inferencex-constants'; + +/** + * Pure description of a benchmark config — the bits that actually feed the + * launch command. Every field except `framework` is optional so callers can pass a + * partial `InferenceData` row or `AggDataEntry`; the generators read what they + * need and substitute a placeholder or default for anything absent. + */ +export interface ReproduceConfig { + framework: string; + model?: string; + precision?: string; + tp?: number; + ep?: number; + dp_attention?: boolean; + spec_decoding?: string; + disagg?: boolean; + prefill_tp?: number; + prefill_ep?: number; + prefill_dp_attention?: boolean; + prefill_num_workers?: number; + num_prefill_gpu?: number; + decode_tp?: number; + decode_ep?: number; + decode_dp_attention?: boolean; + decode_num_workers?: number; + num_decode_gpu?: number; + conc?: number; + isl?: number; + osl?: number; + image?: string; +} + +export type LaunchCommandKind = 'single' | 'disagg' | 'fallback'; + +/** Result of `buildLaunchCommand`. */ +export interface LaunchCommandResult { + /** "single", "disagg" (prefill + decode workers), or "fallback" (no recipe). */ + kind: LaunchCommandKind; + /** The canonical framework key the command was built for. */ + framework: string; + /** Single-command output (kind === 'single'). */ + command?: string; + /** Disagg output: ordered list of stitched commands. */ + commands?: { label: string; command: string }[]; + /** + * Fallback explanation shown in the drawer when we can't render a launch + * command — either because the framework is a multi-process orchestrator + * (Dynamo, ATOM, MoRI) or because no recipe is registered for it. + */ + fallbackReason?: string; +} + +/** Frameworks that orchestrate multiple worker processes — too much to inline. 
*/ +const COMPOUND_FRAMEWORKS = new Set([ + 'atom', + 'mori-sglang', + 'dynamo-vllm', + 'dynamo-trt', + 'dynamo-sglang', +]); + +const FALLBACK_REASONS: Record = { + atom: 'ATOM orchestrates several worker processes — see the Config JSON tab for the full launch graph.', + 'mori-sglang': + 'MoRI SGLang spans prefill / decode / scheduler workers — see the Config JSON tab for the full launch graph.', + 'dynamo-vllm': + 'Dynamo vLLM is launched via the Dynamo runtime against multiple workers — see the Config JSON tab for the full launch graph.', + 'dynamo-trt': + 'Dynamo TRT is launched via the Dynamo runtime against multiple workers — see the Config JSON tab for the full launch graph.', + 'dynamo-sglang': + 'Dynamo SGLang is launched via the Dynamo runtime against multiple workers — see the Config JSON tab for the full launch graph.', +}; + +/** Format a single-line CLI command from an array of args, escaping where needed. */ +const joinArgs = (args: string[]): string => args.filter(Boolean).map(quoteIfNeeded).join(' '); + +const QUOTE_RE = /[^A-Za-z0-9._\-/=:,@%+]/u; +const quoteIfNeeded = (s: string): string => { + if (s === '') return "''"; + // Multi-line chunks and `--flag[=value]` tokens pass through verbatim — NOTE: a `--flag=value` token is never quoted, even if its value needs it. + if (s.includes('\n') || s.startsWith('--')) return s; + if (!QUOTE_RE.test(s)) return s; + return `'${s.replaceAll("'", String.raw`'\''`)}'`; +}; + +/** Format a chunk of CLI args as one indented line per logical group. */ +const formatChunks = (chunks: string[][]): string => + chunks.map((chunk, i) => (i === 0 ? joinArgs(chunk) : ` ${joinArgs(chunk)}`)).join(' \\\n'); + +const baseChunks = (cfg: ReproduceConfig): { precision: string; model: string } => ({ + precision: cfg.precision ?? '', + model: cfg.model ?? '', +}); + +const buildVllmCommand = (cfg: ReproduceConfig): string => { + const { model, precision } = baseChunks(cfg); + const tp = cfg.tp ?? 
1; + const flags: string[][] = [ + ['vllm', 'serve', model], + ['--dtype', precision], + ['--tensor-parallel-size', String(tp)], + ]; + if (cfg.ep !== undefined && cfg.ep > 0) { + flags.push(['--expert-parallel-size', String(cfg.ep)]); + } + if (cfg.dp_attention) flags.push(['--data-parallel-attention']); + if (cfg.spec_decoding && cfg.spec_decoding !== 'none') { + flags.push(['--speculative-config', JSON.stringify({ method: cfg.spec_decoding })]); + } + flags.push(['--max-num-seqs', String(cfg.conc ?? 256)]); + if (cfg.isl !== undefined && cfg.osl !== undefined) { + flags.push(['--max-model-len', String(cfg.isl + cfg.osl)]); + } + return formatChunks(flags); +}; + +const buildSglangCommand = (cfg: ReproduceConfig): string => { + const { model, precision } = baseChunks(cfg); + const tp = cfg.tp ?? 1; + const flags: string[][] = [ + ['python', '-m', 'sglang.launch_server'], + ['--model-path', model], + ['--dtype', precision], + ['--tp', String(tp)], + ]; + if (cfg.ep !== undefined && cfg.ep > 0) { + flags.push(['--ep-size', String(cfg.ep)]); + } + if (cfg.dp_attention) flags.push(['--enable-dp-attention']); + if (cfg.spec_decoding && cfg.spec_decoding !== 'none') { + flags.push(['--speculative-algorithm', cfg.spec_decoding.toUpperCase()]); + } + flags.push(['--max-running-requests', String(cfg.conc ?? 256)]); + if (cfg.isl !== undefined && cfg.osl !== undefined) { + flags.push(['--context-length', String(cfg.isl + cfg.osl)]); + } + return formatChunks(flags); +}; + +const buildTrtCommand = (cfg: ReproduceConfig): string => { + const { model, precision } = baseChunks(cfg); + const tp = cfg.tp ?? 
1; + const flags: string[][] = [ + ['trtllm-serve', model], + ['--backend', 'pytorch'], + ['--tp_size', String(tp)], + ['--kv_cache_dtype', precision], + ]; + if (cfg.ep !== undefined && cfg.ep > 0) { + flags.push(['--ep_size', String(cfg.ep)]); + } + if (cfg.spec_decoding && cfg.spec_decoding !== 'none') { + flags.push([`--speculative_config={"decoding_type":"${cfg.spec_decoding.toUpperCase()}"}`]); + } + flags.push(['--max_batch_size', String(cfg.conc ?? 256)]); + if (cfg.isl !== undefined && cfg.osl !== undefined) { + flags.push(['--max_seq_len', String(cfg.isl + cfg.osl)]); + } + return formatChunks(flags); +}; + +const SIMPLE_BUILDERS: Record<'vllm' | 'sglang' | 'trt', (cfg: ReproduceConfig) => string> = { + vllm: buildVllmCommand, + sglang: buildSglangCommand, + trt: buildTrtCommand, +}; + +const buildDisaggCommands = ( + cfg: ReproduceConfig, + framework: 'vllm' | 'sglang' | 'trt', +): { label: string; command: string }[] => { + const prefill: ReproduceConfig = { + ...cfg, + tp: cfg.prefill_tp ?? cfg.tp, + ep: cfg.prefill_ep ?? cfg.ep, + dp_attention: cfg.prefill_dp_attention ?? cfg.dp_attention, + }; + const decode: ReproduceConfig = { + ...cfg, + tp: cfg.decode_tp ?? cfg.tp, + ep: cfg.decode_ep ?? cfg.ep, + dp_attention: cfg.decode_dp_attention ?? cfg.dp_attention, + }; + const builder = SIMPLE_BUILDERS[framework]; + // Disagg launch lines append a role flag so the user can paste both into + // separate terminals — this matches how SGLang & vLLM disagg expects + // prefill / decode workers to be tagged. + const roleFlag = framework === 'trt' ? '--disaggregate_role' : '--disagg-role'; + const prefillWorkers = cfg.prefill_num_workers ?? 1; + const decodeWorkers = cfg.decode_num_workers ?? 1; + return [ + { + label: `Prefill workers (×${prefillWorkers}, ${cfg.num_prefill_gpu ?? '?'} GPUs)`, + command: `${builder(prefill)} \\\n ${roleFlag} prefill`, + }, + { + label: `Decode workers (×${decodeWorkers}, ${cfg.num_decode_gpu ?? 
'?'} GPUs)`, + command: `${builder(decode)} \\\n ${roleFlag} decode`, + }, + ]; +}; + +/** + * Pure function from `(framework, config)` → a `LaunchCommandResult`. + * + * Returns one of three shapes: + * - `kind: "single"` — a single command (most non-disagg runs). + * - `kind: "disagg"` — two stitched commands for prefill / decode workers. + * - `kind: "fallback"` — no launch command available; the drawer should + * point the user at the Config JSON tab. `fallbackReason` explains why. + * + * The function is intentionally side-effect-free so it can be unit-tested + * per framework and reused for future diffing between runs. + */ +export function buildLaunchCommand( + framework: string, + cfg: Omit, +): LaunchCommandResult { + const canonical = resolveFrameworkAlias(framework); + + if (COMPOUND_FRAMEWORKS.has(canonical)) { + return { + kind: 'fallback', + framework: canonical, + fallbackReason: + FALLBACK_REASONS[canonical] ?? + 'This framework orchestrates several worker processes — see the Config JSON tab.', + }; + } + + if (canonical !== 'vllm' && canonical !== 'sglang' && canonical !== 'trt') { + return { + kind: 'fallback', + framework: canonical, + fallbackReason: `No launch-command recipe is registered for "${canonical}" yet — see the Config JSON tab.`, + }; + } + + const fullCfg: ReproduceConfig = { ...cfg, framework: canonical }; + + if (cfg.disagg) { + return { + kind: 'disagg', + framework: canonical, + commands: buildDisaggCommands(fullCfg, canonical), + }; + } + + return { + kind: 'single', + framework: canonical, + command: SIMPLE_BUILDERS[canonical](fullCfg), + }; +} diff --git a/packages/app/src/lib/reproduce-config.test.ts b/packages/app/src/lib/reproduce-config.test.ts new file mode 100644 index 00000000..44d46648 --- /dev/null +++ b/packages/app/src/lib/reproduce-config.test.ts @@ -0,0 +1,206 @@ +import { describe, expect, it } from 'vitest'; + +import type { InferenceData } from '@/components/inference/types'; + +import { CONFIG_JSON_KEYS, 
buildReproduceConfig } from './reproduce-config'; + +// Minimal "real-looking" point: every allow-list field populated, plus the +// result-metric fields the old subtractive destructure would have leaked. +function makePoint(overrides: Partial = {}): InferenceData { + return { + // Chart-derived (must not appear in output) + x: 100, + y: 200, + tpPerGpu: { y: 50, roof: false }, + tpPerMw: { y: 5, roof: false }, + costh: { y: 1, roof: false }, + costn: { y: 1, roof: false }, + costr: { y: 1, roof: false }, + costhi: { y: 1, roof: false }, + costni: { y: 1, roof: false }, + costri: { y: 1, roof: false }, + + // Allow-list (must appear in output) + model: 'deepseek-ai/DeepSeek-R1', + framework: 'vllm', + precision: 'fp8', + hw: 'b200', + hwKey: 'b200', + tp: 8, + ep: 8, + dp_attention: true, + disagg: false, + is_multinode: false, + prefill_tp: 0, + prefill_ep: 0, + prefill_dp_attention: false, + prefill_num_workers: 0, + decode_tp: 0, + decode_ep: 0, + decode_dp_attention: false, + decode_num_workers: 0, + num_prefill_gpu: 0, + num_decode_gpu: 0, + spec_decoding: 'mtp', + conc: 64, + image: 'vllm/vllm-openai:v0.6.4', + date: '2025-12-01', + actualDate: '2025-12-01', + run_url: 'https://github.com/InferenceMAX/InferenceMAX/actions/runs/123', + + // Result metrics (must not appear in output) + tput_per_gpu: 1234.5, + output_tput_per_gpu: 567.8, + input_tput_per_gpu: 666.7, + mean_ttft: 100, + median_ttft: 95, + std_ttft: 5, + p99_ttft: 150, + mean_tpot: 10, + median_tpot: 9, + std_tpot: 1, + p99_tpot: 20, + mean_intvty: 0.1, + median_intvty: 0.09, + std_intvty: 0.01, + p99_intvty: 0.2, + mean_itl: 5, + median_itl: 4, + std_itl: 1, + p99_itl: 10, + mean_e2el: 200, + median_e2el: 190, + std_e2el: 10, + p99_e2el: 300, + + ...overrides, + }; +} + +const RESULT_KEYS = [ + 'tput_per_gpu', + 'output_tput_per_gpu', + 'input_tput_per_gpu', + 'mean_ttft', + 'median_ttft', + 'std_ttft', + 'p99_ttft', + 'mean_tpot', + 'median_tpot', + 'std_tpot', + 'p99_tpot', + 'mean_intvty', + 
'median_intvty', + 'std_intvty', + 'p99_intvty', + 'mean_itl', + 'median_itl', + 'std_itl', + 'p99_itl', + 'mean_e2el', + 'median_e2el', + 'std_e2el', + 'p99_e2el', +]; + +const CHART_DERIVED_KEYS = [ + 'x', + 'y', + 'tpPerGpu', + 'tpPerMw', + 'outputTputPerGpu', + 'inputTputPerGpu', + 'outputTputPerMw', + 'inputTputPerMw', + 'costh', + 'costn', + 'costr', + 'costhi', + 'costni', + 'costri', + 'costhOutput', + 'costnOutput', + 'costrOutput', + 'costUser', + 'powerUser', + 'jTotal', + 'jOutput', + 'jInput', +]; + +describe('buildReproduceConfig', () => { + it('includes every allow-list key when populated', () => { + const out = buildReproduceConfig(makePoint()); + for (const key of CONFIG_JSON_KEYS) { + expect(out, `missing allow-list key "${key}"`).toHaveProperty(key); + } + }); + + it('omits raw result-metric fields', () => { + const out = buildReproduceConfig(makePoint()); + for (const key of RESULT_KEYS) { + expect(out, `result key "${key}" leaked into config JSON`).not.toHaveProperty(key); + } + }); + + it('omits chart-derived presentational fields', () => { + const out = buildReproduceConfig(makePoint()); + for (const key of CHART_DERIVED_KEYS) { + expect(out, `chart-derived key "${key}" leaked into config JSON`).not.toHaveProperty(key); + } + }); + + it('omits undefined / null allow-list fields (no "image: null" noise)', () => { + const out = buildReproduceConfig( + makePoint({ image: undefined, actualDate: undefined, ep: undefined }), + ); + expect(out).not.toHaveProperty('image'); + expect(out).not.toHaveProperty('actualDate'); + expect(out).not.toHaveProperty('ep'); + // Sibling required fields still present. 
+ expect(out).toHaveProperty('model'); + expect(out).toHaveProperty('framework'); + }); + + it('preserves boolean false and numeric 0 (does not treat them as absent)', () => { + const out = buildReproduceConfig( + makePoint({ disagg: false, is_multinode: false, prefill_tp: 0 }), + ); + expect(out.disagg).toBe(false); + expect(out.is_multinode).toBe(false); + expect(out.prefill_tp).toBe(0); + }); + + it('emits keys in CONFIG_JSON_KEYS order in the stringified output', () => { + const json = JSON.stringify(buildReproduceConfig(makePoint()), null, 2); + const indices = CONFIG_JSON_KEYS.map((k) => json.indexOf(`"${k}":`)); + for (let i = 1; i < indices.length; i += 1) { + expect(indices[i], `key order broken at "${CONFIG_JSON_KEYS[i]}"`).toBeGreaterThan( + indices[i - 1]!, + ); + } + }); + + describe('with sequence', () => { + it('inserts isl/osl just before conc when sequence is provided', () => { + const json = JSON.stringify( + buildReproduceConfig(makePoint(), { isl: 1024, osl: 1024 }), + null, + 2, + ); + const iIsl = json.indexOf('"isl":'); + const iOsl = json.indexOf('"osl":'); + const iConc = json.indexOf('"conc":'); + const iSpec = json.indexOf('"spec_decoding":'); + expect(iIsl).toBeGreaterThan(iSpec); + expect(iOsl).toBeGreaterThan(iIsl); + expect(iConc).toBeGreaterThan(iOsl); + }); + + it('does not emit isl/osl when sequence is omitted', () => { + const out = buildReproduceConfig(makePoint()); + expect(out).not.toHaveProperty('isl'); + expect(out).not.toHaveProperty('osl'); + }); + }); +}); diff --git a/packages/app/src/lib/reproduce-config.ts b/packages/app/src/lib/reproduce-config.ts new file mode 100644 index 00000000..2414c754 --- /dev/null +++ b/packages/app/src/lib/reproduce-config.ts @@ -0,0 +1,75 @@ +import type { InferenceData } from '@/components/inference/types'; + +/** + * Allow-list of config / identity / provenance fields surfaced in the + * Reproduce drawer's "Config JSON" tab. 
/**
 * Allow-list of config / identity / provenance fields surfaced in the
 * Reproduce drawer's "Config JSON" tab.
 *
 * The previous implementation subtractively destructured chart-derived
 * presentational fields off the point and stringified the rest, which left
 * every raw `metrics` value (TTFT / TPOT / ITL / E2EL / INTVTY percentiles
 * plus throughput) mixed in alongside the launch config. Those are *results*,
 * not inputs — copying them back as a "future config diff" is meaningless.
 *
 * Order is significant: `JSON.stringify` preserves insertion order for
 * string keys, so a fixed array order makes the output stable and
 * diff-friendly across runs.
 */
export const CONFIG_JSON_KEYS = [
  'model',
  'framework',
  'precision',
  'hw',
  'hwKey',
  'tp',
  'ep',
  'dp_attention',
  'disagg',
  'is_multinode',
  'prefill_tp',
  'prefill_ep',
  'prefill_dp_attention',
  'prefill_num_workers',
  'decode_tp',
  'decode_ep',
  'decode_dp_attention',
  'decode_num_workers',
  'num_prefill_gpu',
  'num_decode_gpu',
  'spec_decoding',
  'conc',
  'image',
  'date',
  'actualDate',
  'run_url',
] as const satisfies readonly (keyof InferenceData)[];

/**
 * Build the JSON-serializable config object for a point. Pure function so it
 * is unit-testable and reusable for a future "diff between runs of the same
 * config" feature.
 *
 * @param point - Chart point to read the allow-listed fields from. Fields
 *   whose value is `undefined` or `null` are omitted; `false` and `0` are
 *   kept because they are real configuration values.
 * @param sequence - Optional active sequence selection. `isl` / `osl` are not
 *   stored on the chart point, so they are injected from here to keep the
 *   JSON a complete description of "what was run".
 * @returns A plain object whose key order follows `CONFIG_JSON_KEYS`, with
 *   `isl` / `osl` placed immediately before `conc` when `conc` is present,
 *   or appended at the end when the point has no `conc`.
 */
export function buildReproduceConfig(
  point: InferenceData,
  sequence?: { isl: number; osl: number },
): Record<string, unknown> {
  const out: Record<string, unknown> = {};
  for (const key of CONFIG_JSON_KEYS) {
    const value = point[key];
    // Only genuinely absent values are skipped — never falsy ones.
    if (value === undefined || value === null) continue;
    if (key === 'conc' && sequence) {
      // Insert the sequence lengths right before the concurrency so the
      // workload-shape fields read together in the stringified output.
      out.isl = sequence.isl;
      out.osl = sequence.osl;
    }
    out[key] = value;
  }
  // The point had no `conc`, so the in-loop insertion never ran.
  if (sequence && !('isl' in out)) {
    out.isl = sequence.isl;
    out.osl = sequence.osl;
  }
  return out;
}
-- ============================================================
-- BENCHMARK ENVIRONMENTS
-- One row per (workflow_run_id, config_id) capturing the runtime
-- environment a benchmark was produced in.
--
-- Two ingest paths feed this table:
--   * env_json  — authoritative, from upstream CI's env.json artifact
--                 (nvidia-smi / rocm-smi / nvcc / git rev-parse output).
--   * log_parse — fallback, regex over server_logs.server_log for the
--                 framework/torch/python version strings that frameworks
--                 print on startup. Host-level fields stay NULL.
--
-- The drawer's Environment tab joins through (workflow_run_id, config_id)
-- so the data persists across re-ingest and works for every benchmark
-- row of a given config (no per-conc duplication).
-- ============================================================

create table benchmark_environments (
  id                bigserial primary key,
  workflow_run_id   bigint  not null references workflow_runs(id) on delete cascade,
  config_id         integer not null references configs(id),

  image             text,

  -- Framework
  framework_version text,
  framework_sha     text,

  -- Toolchain
  torch_version     text,
  python_version    text,
  cuda_version      text,
  rocm_version      text,

  -- Host
  driver_version    text,
  gpu_sku           text,

  -- Provenance: which ingest path populated this row.
  -- env_json wins permanently; log_parse only fills NULL columns.
  source            text not null default 'log_parse',

  -- Forward-compat for fields the parser captures that don't yet have a
  -- dedicated column (e.g. framework-specific build flags, ROCm-only fields).
  extra             jsonb not null default '{}'::jsonb,

  parsed_at         timestamptz not null default now(),

  constraint benchmark_environments_source_check
    check (source in ('env_json', 'log_parse')),
  constraint benchmark_environments_unique
    unique (workflow_run_id, config_id)
);

-- No separate index on (workflow_run_id): the unique constraint above is
-- backed by a B-tree on (workflow_run_id, config_id), whose leading column
-- already serves lookups (and the ON DELETE CASCADE scan) by workflow_run_id.
-- A second single-column index would only add write overhead.
+ * + * These don't hit a real database — the postgres.js client supports a + * tagged-template call shape, so a small mock that records the assembled + * SQL fragment lets us assert the conflict-resolution contract: + * + * - env_json → unconditional overwrite of every column + * - log_parse → COALESCE: only fills NULL columns, never demotes source + */ + +interface RecordedCall { + sql: string; + values: unknown[]; +} + +function makeMockSql() { + const calls: RecordedCall[] = []; + // The tag function only needs to accept a strings array + interpolated + // values and return a thenable; the helper ignores the return value. + const mockSql = ((strings: TemplateStringsArray, ...values: unknown[]) => { + calls.push({ sql: strings.join('?'), values }); + return Promise.resolve([]); + }) as unknown as Parameters[0]; + return { mockSql, calls }; +} + +const fullEnvJson: ParsedEnv = { + source: 'env_json', + frameworkVersion: '1.3.0rc11', + frameworkSha: 'e136d70cdc6101007017c05d57fb4cec5d6ed98f', + torchVersion: '2.5.1+cu124', + pythonVersion: '3.12.7', + cudaVersion: '12.4', + rocmVersion: null, + driverVersion: '560.35.03', + gpuSku: 'NVIDIA H100 80GB HBM3', + extra: { nccl_version: '2.21.5' }, +}; + +const logParseOnly: ParsedEnv = { + source: 'log_parse', + frameworkVersion: '1.3.0rc11', + frameworkSha: null, + torchVersion: '2.11.0a0+eb65b36914', + pythonVersion: '3.12', + cudaVersion: null, + rocmVersion: null, + driverVersion: null, + gpuSku: null, + extra: {}, +}; + +describe('upsertBenchmarkEnvironment', () => { + it('issues an unconditional overwrite for env_json', async () => { + const { mockSql, calls } = makeMockSql(); + await upsertBenchmarkEnvironment(mockSql, 42, 7, 'nvcr.io/foo:1', fullEnvJson); + expect(calls).toHaveLength(1); + const { sql } = calls[0]; + // env_json branch overwrites without COALESCE + expect(sql).not.toMatch(/coalesce/iu); + expect(sql).toMatch(/source\s*=\s*'env_json'/u); + expect(sql).toMatch(/on conflict \(workflow_run_id, 
/**
 * Idempotent upsert into `benchmark_environments`.
 *
 * Conflict-resolution rule (per `005_benchmark_environments.sql`):
 *   - source = 'env_json': overwrite every column. env.json is authoritative.
 *   - source = 'log_parse': only fill columns that are still NULL (COALESCE),
 *     and never demote a row that's already env_json.
 *
 * This means log-parse fallback from ingest and authoritative env.json ingest
 * can run in any order without clobbering authoritative data.
 */

/** The postgres.js tagged-template client, as returned by `postgres(...)`. */
type Sql = ReturnType<typeof postgres>;

/**
 * Insert or update the environment row for one (workflow run, config) pair.
 *
 * @param sql - postgres.js client; every value is bound as a query parameter.
 * @param workflowRunId - `workflow_runs.id` half of the natural key.
 * @param configId - `configs.id` half of the natural key.
 * @param image - Container image to record, or `null` when unknown.
 * @param parsed - Parsed environment; `parsed.source` selects the
 *   conflict-resolution strategy described in the file header.
 * @returns Resolves once the statement has completed; rejects with whatever
 *   the client rejects with.
 */
export async function upsertBenchmarkEnvironment(
  sql: Sql,
  workflowRunId: number,
  configId: number,
  image: string | null,
  parsed: ParsedEnv,
): Promise<void> {
  // Serialized here and cast with `::jsonb` in SQL so the client binds text.
  const extraJson = JSON.stringify(parsed.extra ?? {});

  if (parsed.source === 'env_json') {
    // Authoritative path: every column, including `source` and `extra`, is
    // replaced with the incoming values.
    await sql`
      insert into benchmark_environments (
        workflow_run_id, config_id,
        image, framework_version, framework_sha,
        torch_version, python_version, cuda_version, rocm_version,
        driver_version, gpu_sku, source, extra, parsed_at
      ) values (
        ${workflowRunId}, ${configId},
        ${image}, ${parsed.frameworkVersion}, ${parsed.frameworkSha},
        ${parsed.torchVersion}, ${parsed.pythonVersion}, ${parsed.cudaVersion}, ${parsed.rocmVersion},
        ${parsed.driverVersion}, ${parsed.gpuSku}, 'env_json', ${extraJson}::jsonb, now()
      )
      on conflict (workflow_run_id, config_id) do update set
        image             = excluded.image,
        framework_version = excluded.framework_version,
        framework_sha     = excluded.framework_sha,
        torch_version     = excluded.torch_version,
        python_version    = excluded.python_version,
        cuda_version      = excluded.cuda_version,
        rocm_version      = excluded.rocm_version,
        driver_version    = excluded.driver_version,
        gpu_sku           = excluded.gpu_sku,
        source            = 'env_json',
        extra             = excluded.extra,
        parsed_at         = now()
    `;
    return;
  }

  // log_parse: never overwrite an existing env_json row; only fill NULL
  // columns. We rely on COALESCE in the DO UPDATE clause so a subsequent
  // env_json insert can still upgrade us, and an earlier env_json insert
  // is fully preserved. `source` and `extra` are deliberately absent from
  // the update list; `parsed_at` is refreshed on every call.
  await sql`
    insert into benchmark_environments (
      workflow_run_id, config_id,
      image, framework_version, framework_sha,
      torch_version, python_version, cuda_version, rocm_version,
      driver_version, gpu_sku, source, extra, parsed_at
    ) values (
      ${workflowRunId}, ${configId},
      ${image}, ${parsed.frameworkVersion}, ${parsed.frameworkSha},
      ${parsed.torchVersion}, ${parsed.pythonVersion}, ${parsed.cudaVersion}, ${parsed.rocmVersion},
      ${parsed.driverVersion}, ${parsed.gpuSku}, 'log_parse', ${extraJson}::jsonb, now()
    )
    on conflict (workflow_run_id, config_id) do update set
      image             = coalesce(benchmark_environments.image, excluded.image),
      framework_version = coalesce(benchmark_environments.framework_version, excluded.framework_version),
      framework_sha     = coalesce(benchmark_environments.framework_sha, excluded.framework_sha),
      torch_version     = coalesce(benchmark_environments.torch_version, excluded.torch_version),
      python_version    = coalesce(benchmark_environments.python_version, excluded.python_version),
      cuda_version      = coalesce(benchmark_environments.cuda_version, excluded.cuda_version),
      rocm_version      = coalesce(benchmark_environments.rocm_version, excluded.rocm_version),
      driver_version    = coalesce(benchmark_environments.driver_version, excluded.driver_version),
      gpu_sku           = coalesce(benchmark_environments.gpu_sku, excluded.gpu_sku),
      -- source stays whatever it already was; log_parse can't downgrade env_json
      parsed_at         = now()
  `;
}
'0.4.3.post2', + framework_sha: 'e136d70cdc6101007017c05d57fb4cec5d6ed98f', + image: 'lmsysorg/sglang:latest', + torch: '2.5.1+cu124', + python: '3.12.7', + cuda: '12.4', + rocm: null, + driver: '560.35.03', + gpu_sku: 'NVIDIA H100 80GB HBM3', + }); + const env = readEnvJson(text); + expect(env).toEqual({ + source: 'env_json', + frameworkVersion: '0.4.3.post2', + frameworkSha: 'e136d70cdc6101007017c05d57fb4cec5d6ed98f', + torchVersion: '2.5.1+cu124', + pythonVersion: '3.12.7', + cudaVersion: '12.4', + rocmVersion: null, + driverVersion: '560.35.03', + gpuSku: 'NVIDIA H100 80GB HBM3', + extra: {}, + }); + }); + + it('handles AMD-style env.json (rocm set, cuda null)', () => { + const env = readEnvJson( + JSON.stringify({ + framework: 'sglang', + rocm: '6.2.0', + driver: '6.7.0', + gpu_sku: 'AMD Instinct MI355X', + }), + ); + expect(env.rocmVersion).toBe('6.2.0'); + expect(env.cudaVersion).toBeNull(); + expect(env.gpuSku).toBe('AMD Instinct MI355X'); + }); + + it('treats missing fields as null', () => { + const env = readEnvJson('{}'); + expect(env.source).toBe('env_json'); + expect(env.frameworkVersion).toBeNull(); + expect(env.frameworkSha).toBeNull(); + expect(env.torchVersion).toBeNull(); + expect(env.pythonVersion).toBeNull(); + expect(env.cudaVersion).toBeNull(); + expect(env.rocmVersion).toBeNull(); + expect(env.driverVersion).toBeNull(); + expect(env.gpuSku).toBeNull(); + }); + + it('treats empty strings the same as null', () => { + const env = readEnvJson(JSON.stringify({ cuda: '', driver: ' ' })); + expect(env.cudaVersion).toBeNull(); + expect(env.driverVersion).toBeNull(); + }); + + it('captures unknown fields on extra', () => { + const env = readEnvJson( + JSON.stringify({ + framework_version: '1.0', + nccl_version: '2.21.5', + kernel: '6.5.0-generic', + }), + ); + expect(env.extra).toEqual({ + nccl_version: '2.21.5', + kernel: '6.5.0-generic', + }); + }); + + it('throws on non-object input', () => { + expect(() => readEnvJson('[]')).toThrow(/must be a 
/**
 * Reader for the upstream CI `env.json` artifact — the AUTHORITATIVE source
 * for the Reproduce Drawer's Environment tab.
 *
 * The upstream PR in `SemiAnalysisAI/InferenceX` writes this file next to
 * `server.log` inside each `server_logs_*` artifact directory.
 * Our ingest path (`ingest-ci-run.ts`) prefers this when present and falls
 * back to `parseServerLogEnv()` otherwise.
 *
 * Contract (all fields optional; missing or null fields stay null in the
 * resulting `ParsedEnv`):
 *
 *   {
 *     "framework": "sglang",
 *     "framework_version": "0.4.3.post2",
 *     "framework_sha": "e136d70cdc6101007017c05d57fb4cec5d6ed98f",
 *     "image": "lmsysorg/sglang:latest",
 *     "torch": "2.5.1+cu124",
 *     "python": "3.12.7",
 *     "cuda": "12.4",
 *     "rocm": null,
 *     "driver": "560.35.03",
 *     "gpu_sku": "NVIDIA H100 80GB HBM3"
 *   }
 *
 * Any keys not listed above are preserved on `extra` so the upstream CI
 * can add fields without an app-side schema change.
 */

/**
 * Keys that belong to the documented contract and therefore never land on
 * `extra`. Note that `framework` and `image` are recognised here but are not
 * copied onto the returned `ParsedEnv` by this reader.
 */
const KNOWN_KEYS: ReadonlySet<string> = new Set<string>([
  'framework',
  'framework_version',
  'framework_sha',
  'image',
  'torch',
  'python',
  'cuda',
  'rocm',
  'driver',
  'gpu_sku',
]);

/**
 * Normalize one field: non-strings and blank / whitespace-only strings
 * become `null`; everything else is returned trimmed.
 */
function str(v: unknown): string | null {
  if (typeof v !== 'string') return null;
  const trimmed = v.trim();
  return trimmed.length > 0 ? trimmed : null;
}

/**
 * Parse the raw `env.json` text into a `ParsedEnv`.
 *
 * @param envJsonText - Raw file contents.
 * @returns A `ParsedEnv` tagged `source: 'env_json'`; unknown keys are kept
 *   verbatim (values untouched) on `extra`.
 * @throws SyntaxError when the text is not valid JSON.
 * @throws TypeError when the JSON value is not a plain object (array,
 *   string, number, boolean, or null). Callers should `try/catch` and fall
 *   back to `parseServerLogEnv()` so a malformed artifact never blocks ingest.
 */
export function readEnvJson(envJsonText: string): ParsedEnv {
  // Keep the parse result `unknown` until its shape has actually been checked.
  const parsed: unknown = JSON.parse(envJsonText);
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new TypeError('env.json must be a JSON object');
  }
  const raw = parsed as Record<string, unknown>;

  // `Object.fromEntries` defines own data properties, so an unexpected key
  // such as "__proto__" is stored as data instead of rewriting the prototype.
  const extra: Record<string, unknown> = Object.fromEntries(
    Object.entries(raw).filter(([key]) => !KNOWN_KEYS.has(key)),
  );

  return {
    source: 'env_json',
    frameworkVersion: str(raw.framework_version),
    frameworkSha: str(raw.framework_sha),
    torchVersion: str(raw.torch),
    pythonVersion: str(raw.python),
    cudaVersion: str(raw.cuda),
    rocmVersion: str(raw.rocm),
    driverVersion: str(raw.driver),
    gpuSku: str(raw.gpu_sku),
    extra,
  };
}
+ +describe('parseServerLogEnv', () => { + describe('trt / dynamo-trt', () => { + const log = [ + '/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: …', + 'Skipping import of cpp extensions due to incompatible torch version 2.11.0a0+eb65b36914.nv26.02 for torchao version 0.15.0', + '[TensorRT-LLM] TensorRT LLM version: 1.3.0rc11', + ].join('\n'); + + it('extracts the TensorRT-LLM version', () => { + const env = parseServerLogEnv(log, 'trt'); + expect(env.frameworkVersion).toBe('1.3.0rc11'); + }); + + it('treats dynamo-trt the same as trt', () => { + const env = parseServerLogEnv(log, 'dynamo-trt'); + expect(env.frameworkVersion).toBe('1.3.0rc11'); + }); + + it('captures torch + python versions across frameworks', () => { + const env = parseServerLogEnv(log, 'trt'); + expect(env.torchVersion).toBe('2.11.0a0+eb65b36914.nv26.02'); + expect(env.pythonVersion).toBe('3.12'); + }); + }); + + describe('vllm / dynamo-vllm', () => { + const log = [ + '(APIServer pid=2163842) INFO 05-06 21:00:42 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0', + '(APIServer pid=2163842) INFO 05-06 21:00:42 [utils.py:233] non-default args: …', + ].join('\n'); + + it('extracts the vLLM version', () => { + const env = parseServerLogEnv(log, 'vllm'); + expect(env.frameworkVersion).toBe('0.19.0'); + }); + + it('treats dynamo-vllm the same as vllm', () => { + const env = parseServerLogEnv(log, 'dynamo-vllm'); + expect(env.frameworkVersion).toBe('0.19.0'); + }); + }); + + describe('sglang / atom (no version line today)', () => { + const log = [ + '[2026-05-10 17:27:39] server_args=ServerArgs(model_path=…, tp_size=4, …)', + 'python3.10 site-packages …', + ].join('\n'); + + it('returns null framework version for sglang', () => { + const env = parseServerLogEnv(log, 'sglang'); + expect(env.frameworkVersion).toBeNull(); + }); + + it('returns null framework version for atom', () => { + const env = parseServerLogEnv(log, 'atom'); + expect(env.frameworkVersion).toBeNull(); 
+ }); + + it('still captures python version', () => { + const env = parseServerLogEnv(log, 'sglang'); + expect(env.pythonVersion).toBe('3.10'); + }); + }); + + describe('contract', () => { + it('always tags source = log_parse', () => { + expect(parseServerLogEnv('', 'trt').source).toBe('log_parse'); + expect(parseServerLogEnv('arbitrary', 'sglang').source).toBe('log_parse'); + }); + + it('returns all nulls (and extra={}) for an empty log', () => { + const env = parseServerLogEnv('', 'trt'); + expect(env.frameworkVersion).toBeNull(); + expect(env.frameworkSha).toBeNull(); + expect(env.torchVersion).toBeNull(); + expect(env.pythonVersion).toBeNull(); + expect(env.cudaVersion).toBeNull(); + expect(env.rocmVersion).toBeNull(); + expect(env.driverVersion).toBeNull(); + expect(env.gpuSku).toBeNull(); + expect(env.extra).toEqual({}); + }); + + it('never fills host-level fields from logs', () => { + // Even when an nvidia-smi-shaped block accidentally appears in a log, + // we deliberately do not parse it — logs are not authoritative for + // these fields. + const log = [ + '[TensorRT-LLM] TensorRT LLM version: 1.3.0rc11', + 'NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4', + ].join('\n'); + const env = parseServerLogEnv(log, 'trt'); + expect(env.driverVersion).toBeNull(); + expect(env.cudaVersion).toBeNull(); + expect(env.gpuSku).toBeNull(); + }); + + it('handles unknown framework keys gracefully', () => { + const env = parseServerLogEnv('python3.11 something', 'someNewFramework'); + expect(env.frameworkVersion).toBeNull(); + expect(env.pythonVersion).toBe('3.11'); + }); + }); +}); diff --git a/packages/db/src/etl/env-parser.ts b/packages/db/src/etl/env-parser.ts new file mode 100644 index 00000000..c3789479 --- /dev/null +++ b/packages/db/src/etl/env-parser.ts @@ -0,0 +1,109 @@ +/** + * Pure parser that extracts environment metadata from a server.log preamble. 
/**
 * Pure parser that extracts environment metadata from a server.log preamble.
 *
 * This is the FALLBACK path for the Reproduce Drawer's Environment tab —
 * authoritative data comes from upstream CI's `env.json` artifact (parsed by
 * `env-json-reader.ts`). The log parser exists so historical rows (and any
 * future run that drops env.json) still surface partial data instead of
 * nothing.
 *
 * Host-level fields (driver / CUDA / GPU SKU / framework SHA) are NEVER
 * filled from logs — no current framework prints `nvidia-smi`/`rocm-smi` or
 * `git rev-parse` output. The parser returns `null` for them so the
 * downstream UI can render `(not recorded)` until env.json is available.
 *
 * The function is pure and synchronous; framework-keyed regexes live in the
 * `FRAMEWORK_VERSION_PATTERNS` table so adding a new framework only requires
 * one entry and one fixture in the companion test file.
 */

/** Shape shared with `env-json-reader.ts`. */
export interface ParsedEnv {
  source: 'env_json' | 'log_parse';
  frameworkVersion: string | null;
  frameworkSha: string | null;
  torchVersion: string | null;
  pythonVersion: string | null;
  cudaVersion: string | null;
  rocmVersion: string | null;
  driverVersion: string | null;
  gpuSku: string | null;
  extra: Record<string, unknown>;
}

/** TensorRT-LLM startup line: `[TensorRT-LLM] TensorRT LLM version: X`. */
const TRT_VERSION_PATTERN = /\[TensorRT-LLM\]\s+TensorRT LLM version:\s*(\S+)/u;

/**
 * vLLM renders the "vLLM" banner as ASCII art, so we can't anchor on the
 * literal name. The `Initializing a V[N] LLM engine (vX.Y.Z)` line is the
 * primary marker. As a secondary, the banner line itself ends with
 * `version X.Y.Z` and is logged from `utils.py` — anchor on that
 * `[utils.py:<line>]` file marker so we don't false-match unrelated
 * "version X.Y.Z" mentions later in the log.
 */
const VLLM_VERSION_PATTERN =
  /(?:Initializing a V\d+ LLM engine \(v|\[utils\.py:\d+\][^\n]*?version\s+)(\d+\.\d+\.\d+[^\s)]*)/iu;

/**
 * Framework → regex that captures the version string on the framework's
 * startup log line. Each pattern is anchored on a unique marker so we don't
 * accidentally match a transient warning later in the log.
 *
 * Frameworks without a known version line map to `null`. When upstream adds
 * one (e.g. SGLang or Atom start logging their version), drop a regex in
 * here and add a fixture to `env-parser.test.ts`.
 */
const FRAMEWORK_VERSION_PATTERNS: Record<string, RegExp | null> = {
  trt: TRT_VERSION_PATTERN,
  'dynamo-trt': TRT_VERSION_PATTERN,
  vllm: VLLM_VERSION_PATTERN,
  'dynamo-vllm': VLLM_VERSION_PATTERN,
  sglang: null,
  'mori-sglang': null,
  'dynamo-sglang': null,
  atom: null,
};

/** Cross-framework torch version line: `incompatible torch version 2.11.0a0+...`. */
const TORCH_VERSION_PATTERN = /\btorch version\s+(\S+?)(?=\s|$)/iu;

/** Cross-framework python detection: `python3.12` / `python3.10`. */
const PYTHON_VERSION_PATTERN = /\bpython(\d+\.\d+)\b/iu;

/**
 * Parse the env-relevant fields out of a server.log.
 *
 * @param log - Full or partial server log text. An empty string yields an
 *   all-null result.
 * @param framework - The normalized framework key (lowercase, already passed
 *   through `normalizeFramework()` upstream). Unknown keys are treated as
 *   "no version pattern": `frameworkVersion` stays null but torch/python
 *   parsing still runs.
 * @returns A `ParsedEnv` always tagged `source: 'log_parse'`, with
 *   `frameworkSha`, `cudaVersion`, `rocmVersion`, `driverVersion` and
 *   `gpuSku` left null and `extra` empty.
 */
export function parseServerLogEnv(log: string, framework: string): ParsedEnv {
  const out: ParsedEnv = {
    source: 'log_parse',
    frameworkVersion: null,
    frameworkSha: null,
    torchVersion: null,
    pythonVersion: null,
    cudaVersion: null,
    rocmVersion: null,
    driverVersion: null,
    gpuSku: null,
    extra: {},
  };

  if (!log) return out;

  // Own-property check so a key such as "constructor" or "__proto__" can
  // never resolve to something inherited from Object.prototype.
  const fwPattern = Object.prototype.hasOwnProperty.call(FRAMEWORK_VERSION_PATTERNS, framework)
    ? FRAMEWORK_VERSION_PATTERNS[framework]
    : null;
  if (fwPattern) {
    const m = log.match(fwPattern);
    if (m && m[1]) out.frameworkVersion = m[1];
  }

  const torchMatch = log.match(TORCH_VERSION_PATTERN);
  if (torchMatch && torchMatch[1]) {
    // The capture already stops at the first whitespace, so words that
    // follow the version ("… for torchao …") are never part of it. What can
    // still be glued on is sentence punctuation ("torch version 2.5.1."),
    // which is stripped here.
    out.torchVersion = torchMatch[1].replace(/[.,;:]+$/u, '');
  }

  const pythonMatch = log.match(PYTHON_VERSION_PATTERN);
  if (pythonMatch && pythonMatch[1]) out.pythonVersion = pythonMatch[1];

  return out;
}
+ const envJsonPath = path.join(path.dirname(logPath), 'env.json'); + let envJsonParsed: ParsedEnv | null = null; + if (fs.existsSync(envJsonPath)) { + try { + envJsonParsed = readEnvJson(fs.readFileSync(envJsonPath, 'utf8')); + } catch (error: any) { + console.warn( + ` [WARN] failed to parse env.json for ${configKey}: ${error.message}`, + ); + } + } + + const seenConfigIds = new Set(); + for (const r of toInsert) { + if (seenConfigIds.has(r.configId)) continue; + seenConfigIds.add(r.configId); + const parsed = envJsonParsed ?? parseServerLogEnv(serverLog, r.config.framework); + try { + await upsertBenchmarkEnvironment( + sql, + workflowRunId, + r.configId, + r.image, + parsed, + ); + } catch (error: any) { + tracker.recordDbError(`env for ${configKey}`, error); + } + } } catch (error: any) { tracker.recordDbError(`server_log for ${configKey}`, error); } diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index 9c17bfaf..8f317ee9 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -44,6 +44,9 @@ import { bulkUpsertAvailability, insertServerLog, } from './etl/benchmark-ingest'; +import { parseServerLogEnv, type ParsedEnv } from './etl/env-parser'; +import { readEnvJson } from './etl/env-json-reader'; +import { upsertBenchmarkEnvironment } from './etl/env-ingest'; import { mapEvalRow, mapAggEvalRow, type EvalParams } from './etl/eval-mapper'; import { ingestEvalRow } from './etl/eval-ingest'; import { mapEvalSamples } from './etl/eval-samples-mapper'; @@ -610,6 +613,33 @@ async function main(): Promise { // Strip null bytes — some logs contain 0x00 which PostgreSQL text columns reject const clean = serverLog.replaceAll('\u0000', ''); await insertServerLog(sql, insertedIds, clean); + + // Populate benchmark_environments. env.json (next to + // server.log inside the same ZIP) is authoritative; the + // server-log parser is the fallback. 
Errors here MUST NOT + // abort the benchmark ingest. + const envJsonText = readZipText(serverLogPath, 'env.json'); + let envJsonParsed: ParsedEnv | null = null; + if (envJsonText) { + try { + envJsonParsed = readEnvJson(envJsonText); + } catch (error: any) { + console.warn( + ` [WARN] failed to parse env.json in ${path.basename(serverLogPath)}: ${error.message}`, + ); + } + } + const seenConfigIds = new Set(); + for (const r of toInsert) { + if (seenConfigIds.has(r.configId)) continue; + seenConfigIds.add(r.configId); + const parsed = envJsonParsed ?? parseServerLogEnv(clean, r.config.framework); + try { + await upsertBenchmarkEnvironment(sql, workflowRunId, r.configId, r.image, parsed); + } catch (error: any) { + tracker.recordDbError(`env for ${zipFile}`, error); + } + } } } } catch (error: any) { diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 25525e04..944f3957 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -12,6 +12,7 @@ import { existsSync, readFileSync } from 'node:fs'; import { resolve } from 'node:path'; import type { BenchmarkRow } from './queries/benchmarks.js'; +import type { BenchmarkEnvironment } from './queries/environments.js'; import type { EvalRow } from './queries/evaluations.js'; import type { ReliabilityRow } from './queries/reliability.js'; import type { @@ -273,6 +274,8 @@ function toBenchmarkRow( metrics?: Record, ): BenchmarkRow { return { + workflow_run_id: br.workflow_run_id, + config_id: br.config_id, hardware: c.hardware, framework: c.framework, model: c.model, @@ -583,3 +586,45 @@ export function getServerLog(benchmarkResultId: number): string | null { return s.serverLogs.get(logId) ?? 
// ---------------------------------------------------------------------------
// Environments — feeds /api/v1/run-environment
// ---------------------------------------------------------------------------

/**
 * Look up the environment record for one (workflow run, config) pair in the
 * JSON dump.
 *
 * Older dumps that pre-date the `benchmark_environments` table won't have
 * a `benchmark_environments.json` file. In that case this returns `null`,
 * matching the "no row found" SQL behavior. The same `null` is returned when
 * the file exists but holds no entry for the requested pair.
 *
 * The dump file is read and parsed on every call.
 */
export function getEnvironmentForRunConfig(
  workflowRunId: number,
  configId: number,
): BenchmarkEnvironment | null {
  // Dump entries carry the natural key alongside the environment columns.
  type StoredEnvironment = BenchmarkEnvironment & {
    workflow_run_id: number;
    config_id: number;
  };

  const store = getStore();
  const dumpFile = resolve(store.dumpDir, 'benchmark_environments.json');
  if (!existsSync(dumpFile)) {
    return null;
  }

  const entries = JSON.parse(readFileSync(dumpFile, 'utf8')) as StoredEnvironment[];
  // Ids are coerced with Number() before comparing, so numeric strings in
  // the dump still match.
  const isRequestedPair = (entry: StoredEnvironment): boolean =>
    Number(entry.workflow_run_id) === workflowRunId && Number(entry.config_id) === configId;

  const found = entries.find(isRequestedPair);
  if (found === undefined) {
    return null;
  }

  // Copy only the public environment columns; the key halves are dropped
  // and a missing `extra` is normalized to an empty object.
  const {
    source,
    image,
    framework_version,
    framework_sha,
    torch_version,
    python_version,
    cuda_version,
    rocm_version,
    driver_version,
    gpu_sku,
  } = found;
  return {
    source,
    image,
    framework_version,
    framework_sha,
    torch_version,
    python_version,
    cuda_version,
    rocm_version,
    driver_version,
    gpu_sku,
    extra: found.extra ?? {},
  };
}
/**
 * One `benchmark_environments` row as exposed to API consumers. Every
 * version / host column is nullable because either ingest path may leave it
 * unfilled.
 */
export interface BenchmarkEnvironment {
  /** Provenance: 'env_json' = authoritative, 'log_parse' = best-effort fallback. */
  source: 'env_json' | 'log_parse';
  image: string | null;
  framework_version: string | null;
  framework_sha: string | null;
  torch_version: string | null;
  python_version: string | null;
  cuda_version: string | null;
  rocm_version: string | null;
  driver_version: string | null;
  gpu_sku: string | null;
  /** Anything captured that doesn't yet have a dedicated column. */
  extra: Record<string, unknown>;
}
/**
 * Fetch the environment row for a (workflow run, config) pair — the natural
 * key of `benchmark_environments`.
 *
 * @param sql - Database client used as a tagged template; both ids are
 *   bound as query parameters.
 * @param workflowRunId - `workflow_run_id` to match.
 * @param configId - `config_id` to match.
 * @returns The matching row, or `null` when no row exists (e.g. very old
 *   data that has never been backfilled).
 */
export async function getEnvironmentForRunConfig(
  sql: DbClient,
  workflowRunId: number,
  configId: number,
): Promise<BenchmarkEnvironment | null> {
  // The client returns untyped rows; the column list below is kept in step
  // with `BenchmarkEnvironment`, which is what justifies the cast.
  const rows = (await sql`
    select
      be.source,
      be.image,
      be.framework_version,
      be.framework_sha,
      be.torch_version,
      be.python_version,
      be.cuda_version,
      be.rocm_version,
      be.driver_version,
      be.gpu_sku,
      be.extra
    from benchmark_environments be
    where be.workflow_run_id = ${workflowRunId}
      and be.config_id = ${configId}
    limit 1
  `) as unknown as BenchmarkEnvironment[];
  return rows[0] ?? null;
}