diff --git a/controller/src/core/errors.ts b/controller/src/core/errors.ts index f7f3b979..74e468dd 100644 --- a/controller/src/core/errors.ts +++ b/controller/src/core/errors.ts @@ -1,18 +1,31 @@ +import type { RuntimeFailureReason } from "../../../shared/contracts/runtime-failures"; + export class HttpStatus extends Error { public readonly status: number; public readonly detail: string; + public readonly reason?: RuntimeFailureReason; + public readonly code?: string; - public constructor(status: number, detail: string) { + public constructor(status: number, detail: string, reason?: RuntimeFailureReason, code?: string) { super(detail); this.status = status; this.detail = detail; + if (reason !== undefined) { + this.reason = reason; + } + if (code !== undefined) { + this.code = code; + } } } export const isHttpStatus = (value: unknown): value is HttpStatus => value instanceof HttpStatus; -export const notFound = (detail: string): HttpStatus => new HttpStatus(404, detail); +export const notFound = (detail: string, reason?: RuntimeFailureReason, code?: string): HttpStatus => + new HttpStatus(404, detail, reason, code); -export const badRequest = (detail: string): HttpStatus => new HttpStatus(400, detail); +export const badRequest = (detail: string, reason?: RuntimeFailureReason, code?: string): HttpStatus => + new HttpStatus(400, detail, reason, code); -export const serviceUnavailable = (detail: string): HttpStatus => new HttpStatus(503, detail); +export const serviceUnavailable = (detail: string, reason?: RuntimeFailureReason, code?: string): HttpStatus => + new HttpStatus(503, detail, reason, code); diff --git a/controller/src/http/app.ts b/controller/src/http/app.ts index acb43fc1..8e5c72f8 100644 --- a/controller/src/http/app.ts +++ b/controller/src/http/app.ts @@ -143,7 +143,14 @@ export const createApp = (context: AppContext): Hono => { app.onError((error, ctx) => { if (isHttpStatus(error)) { - return ctx.json({ detail: error.detail }, { status: error.status }); + const body: Record = { detail: error.detail }; + if (error.reason !== undefined) { + body["reason"] = error.reason; + } + if (error.code !== undefined) { + body["code"] = error.code; + } + return ctx.json(body, { status: error.status }); } // Client-initiated disconnects (stream cancel, page close, Droid // cancelling an in-flight request to start a new turn) are not our diff --git a/controller/src/modules/engines/engine-coordinator.ts b/controller/src/modules/engines/engine-coordinator.ts index d3e8734f..9eb478e7 100644 --- a/controller/src/modules/engines/engine-coordinator.ts +++ b/controller/src/modules/engines/engine-coordinator.ts @@ -1,6 +1,8 @@ import { AsyncLock, delay } from "../../core/async"; import { primaryLogPathFor, readFileTailBytes } from "../../core/log-files"; import { Event, type EventManager } from "../system/event-manager"; import { CONTROLLER_EVENTS } from "../../../../shared/contracts/controller-events"; import { pidExists } from "./process/process-utilities"; import { isRecipeRunning } from "../models/recipes/recipe-matching"; +import { classifyLaunchFailure } from "./process/launch-failure-classifier"; +import type { RuntimeFailureReason } from "../../../../shared/contracts/runtime-failures"; import type { ProcessInfo, Recipe } from "../models/types"; import type { Config } from "../../config/env"; import type { Logger } from "../../core/logger"; import type { ProcessManager } from "./process/process-manager"; import type { RecipeStore } from "../models/recipes/recipe-store"; import { LIFECYCLE_READY_TIMEOUT_MS } from "./configs"; @@ -62,7 +64,8 @@ export class EngineCoordinator implements EngineService { await this.deps.eventManager.publishLaunchProgress(recipe.id, "launching", `Starting ${recipe.name}...`, 0.25); const launch = await this.deps.processManager.launchModel(recipe); spawnedPid = launch.pid; this.activeLaunchPid = launch.pid; if (!launch.success) { - await this.deps.eventManager.publishLaunchProgress(recipe.id, "error", launch.message, 0); return { ok: false, error: launch.message }; + await this.deps.eventManager.publishLaunchProgress(recipe.id, "error", launch.message, 0, launch.reason, launch.code); + return { ok: false, error: launch.message, ...(launch.reason ? { reason: launch.reason } : {}), ...(launch.code ? { code: launch.code } : {}) }; } const postLaunchAbort = await abortIfNeeded(recipe); if (postLaunchAbort) return postLaunchAbort; await this.deps.eventManager.publishLaunchProgress(recipe.id, "waiting", "Loading model... (0s)", 0.5); @@ -78,27 +81,55 @@ export class EngineCoordinator implements EngineService { return { ok: true }; } if (launch.pid) { await this.deps.processManager.killProcess(launch.pid, true); } - await this.deps.eventManager.publishLaunchProgress(recipe.id, "error", ready.message, 0); return { ok: false, error: ready.message }; + await this.deps.eventManager.publishLaunchProgress(recipe.id, "error", ready.message, 0, ready.reason, ready.code); + return { ok: false, error: ready.message, ...(ready.reason ? { reason: ready.reason } : {}), ...(ready.code ? { code: ready.code } : {}) }; } finally { if (this.activeLifecycleAbort === lifecycleAbort) { this.activeLifecycleAbort = null; } if (this.activeLaunchPid === spawnedPid) { this.activeLaunchPid = null; } options.signal?.removeEventListener("abort", abortLifecycle); release(); } } - private async waitForReady(options: { recipe: Recipe; pid: number | null; logFilePath: string | null; cancel?: AbortSignal; timeoutMs?: number; fatalPatterns?: string[]; onProgress?: (elapsedSeconds: number) => Promise }): Promise<{ ready: true } | { ready: false; message: string }> { + private async waitForReady( + options: { + recipe: Recipe; + pid: number | null; + logFilePath: string | null; + cancel?: AbortSignal; + timeoutMs?: number; + fatalPatterns?: string[]; + onProgress?: (elapsedSeconds: number) => Promise; + } + ): Promise<{ ready: true } | { ready: false; message: string; reason?: RuntimeFailureReason; code?: string }> { const timeout = options.timeoutMs ?? LIFECYCLE_READY_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { if (options.cancel?.aborted) { return { ready: false, message: "Launch cancelled" }; } - if (options.pid && !pidExists(options.pid)) { const errorTail = options.logFilePath ? readFileTailBytes(options.logFilePath, 500) : ""; - return { ready: false, - message: `Model ${options.recipe.id} crashed during startup: ${errorTail.slice(-200)}`, }; + if (options.pid && !pidExists(options.pid)) { + const errorTail = options.logFilePath ? readFileTailBytes(options.logFilePath, 500) : ""; + const message = `Model ${options.recipe.id} crashed during startup: ${errorTail.slice(-200)}`; + return { + ready: false, + message, + reason: classifyLaunchFailure(message, { logTail: errorTail }) ?? "process_exited_early", + code: "crash", + }; + } + if (options.logFilePath && options.fatalPatterns && options.fatalPatterns.length > 0) { + const logTail = readFileTailBytes(options.logFilePath, 3000); + for (const pattern of options.fatalPatterns) { + if (!logTail.includes(pattern)) continue; + const lines = logTail.split("\n"); + const index = lines.findIndex((line) => line.includes(pattern)); + const snippet = index >= 0 ? lines.slice(Math.max(0, index - 1), index + 3).join("\n") : pattern; + const message = `Fatal error: ${snippet.slice(0, 300)}`; + return { + ready: false, + message, + reason: classifyLaunchFailure(message, { logTail }) ?? "unknown", + code: "fatal-pattern", + }; + } } - if (options.logFilePath && options.fatalPatterns && options.fatalPatterns.length > 0) { const logTail = readFileTailBytes(options.logFilePath, 3000); - for (const pattern of options.fatalPatterns) { if (!logTail.includes(pattern)) continue; - const lines = logTail.split("\n"); const index = lines.findIndex((line) => line.includes(pattern)); - const snippet = index >= 0 ? lines.slice(Math.max(0, index - 1), index + 3).join("\n") : pattern; return { ready: false, message: `Fatal error: ${snippet.slice(0, 300)}` }; - } } try { const { fetchLocal } = await import("../../http/local-fetch"); const response = await fetchLocal(this.deps.config.inference_port, "/health", { host: this.deps.config.inference_host, @@ -110,8 +141,12 @@ export class EngineCoordinator implements EngineService { await options.onProgress(elapsedSeconds); } await delay(2000); } return { - ready: false, message: `Model ${options.recipe.id} failed to become ready (timeout)`, - }; } + ready: false, + message: `Model ${options.recipe.id} failed to become ready (timeout)`, + reason: "health_timeout", + code: "timeout", + }; + } private findRecipeForProcess(current: ProcessInfo): Recipe | null { for (const candidate of this.deps.recipeStore.list()) { if (isRecipeRunning(candidate, current, { allowEitherPathContains: true })) { return candidate; } @@ -128,12 +163,21 @@ export class EngineCoordinator implements EngineService { recipe_id: recipe.id, aborted_runs: totalAborted, }); } } - async ensureActive(recipe: Recipe, options: { force_evict?: boolean; publish_events?: boolean } = {}): Promise<{ switched: boolean; error: string | null }> { + async ensureActive( + recipe: Recipe, + options: { force_evict?: boolean; publish_events?: boolean } = {} + ): Promise<{ switched: boolean; error: string | null; reason?: RuntimeFailureReason; code?: string }> { const existing = await this.deps.processManager.findInferenceProcess(this.deps.config.inference_port); if (existing && isRecipeRunning(recipe, existing)) { return { switched: false, error: null }; } - if (this.autoActivationBlocked) { return { - switched: false, error: "Model auto-loading is disabled because the model was manually stopped. Start a model from vLLM Studio before sending local inference requests.", - }; } + if (this.autoActivationBlocked) { + const message = "Model auto-loading is disabled because the model was manually stopped. Start a model from vLLM Studio before sending local inference requests."; + return { + switched: false, + error: message, + reason: classifyLaunchFailure(message) ?? "model_not_served", + code: "auto-loading-blocked", + }; + } const intentSerial = ++this.lifecycleIntentSerial; const lifecycleAbort = new AbortController(); this.activeLifecycleAbort = lifecycleAbort; let launchPid: number | null = null; @@ -142,8 +186,13 @@ export class EngineCoordinator implements EngineService { } const latest = await this.deps.processManager.findInferenceProcess(this.deps.config.inference_port); if (latest && isRecipeRunning(recipe, latest)) { return { switched: false, error: null }; } if (this.autoActivationBlocked) { - return { switched: false, - error: "Model auto-loading is disabled because the model was manually stopped. Start a model from vLLM Studio before sending local inference requests.", }; + const message = "Model auto-loading is disabled because the model was manually stopped. Start a model from vLLM Studio before sending local inference requests."; + return { + switched: false, + error: message, + reason: classifyLaunchFailure(message) ?? "model_not_served", + code: "auto-loading-blocked", + }; } const publishEvents = options.publish_events !== false; const observedProcess = latest ?? existing; const fromRecipe = observedProcess ? this.findRecipeForProcess(observedProcess) : null; const fromModel = fromRecipe ? (fromRecipe.served_model_name ?? fromRecipe.id) : observedProcess ? observedProcess.model_path : null; @@ -165,9 +214,9 @@ export class EngineCoordinator implements EngineService { await this.deps.eventManager.publish( new Event(CONTROLLER_EVENTS.MODEL_SWITCH, { status: "error", to_recipe_id: recipe.id, to_model: recipe.served_model_name ?? recipe.id, to_backend: recipe.backend, - reason: message, }) + reason: message, code: launch.code, }) ); } - return { switched: true, error: message }; } + return { switched: true, error: message, ...(launch.reason ? { reason: launch.reason } : {}), ...(launch.code ? { code: launch.code } : {}) }; } const logFilePath = primaryLogPathFor(this.deps.config.data_dir, recipe.id); const ready = await this.waitForReady({ recipe, pid: launch.pid, logFilePath, @@ -189,9 +238,9 @@ export class EngineCoordinator implements EngineService { await this.deps.eventManager.publish( new Event(CONTROLLER_EVENTS.MODEL_SWITCH, { status: "error", to_recipe_id: recipe.id, to_model: recipe.served_model_name ?? recipe.id, to_backend: recipe.backend, - reason: ready.message, }) + reason: ready.message, code: ready.code, }) ); } - return { switched: true, error: ready.message }; } finally { + return { switched: true, error: ready.message, ...(ready.reason ? { reason: ready.reason } : {}), ...(ready.code ? { code: ready.code } : {}) }; } finally { if (this.activeLifecycleAbort === lifecycleAbort) { this.activeLifecycleAbort = null; } if (this.activeLaunchPid === launchPid) { this.activeLaunchPid = null; } diff --git a/controller/src/modules/engines/engine-service.ts b/controller/src/modules/engines/engine-service.ts index 7faae05c..101d5adb 100644 --- a/controller/src/modules/engines/engine-service.ts +++ b/controller/src/modules/engines/engine-service.ts @@ -1,5 +1,6 @@ import type { Recipe, ProcessInfo } from "../models/types"; import type { ModelDownload } from "../shared/recipe-types"; +import type { RuntimeFailureReason } from "../../../../shared/contracts/runtime-failures"; export type { Recipe, ProcessInfo }; export type { ModelDownload }; @@ -22,6 +23,8 @@ export interface HfModel { export interface EnsureActiveResult { switched: boolean; error: string | null; + reason?: RuntimeFailureReason; + code?: string; } export interface EnsureActiveOptions { @@ -29,7 +32,9 @@ export interface EnsureActiveOptions { publish_events?: boolean; } -export type SetActiveRecipeResult = { ok: true } | { ok: false; error: string }; +export type SetActiveRecipeResult = + | { ok: true } + | { ok: false; error: string; reason?: RuntimeFailureReason; code?: string }; /** Options for setting the active recipe. */ export interface SetActiveRecipeOptions { diff --git a/controller/src/modules/engines/process/launch-failure-classifier.ts b/controller/src/modules/engines/process/launch-failure-classifier.ts new file mode 100644 index 00000000..74a37227 --- /dev/null +++ b/controller/src/modules/engines/process/launch-failure-classifier.ts @@ -0,0 +1,188 @@ +import type { RuntimeFailureReason } from "../../../../../shared/contracts/runtime-failures"; + +export interface ClassifyLaunchFailureOptions { + /** Node child_process spawn error string, if any. */ + spawnError?: string | null; + /** Tail of the process log file, if available. */ + logTail?: string | null; +} + +const normalize = (value: string): string => value.toLowerCase(); + +/** + * Classify a raw launch failure message (and optional context) into a stable + * `RuntimeFailureReason`. This is intentionally conservative: when in doubt, + * return `undefined` so callers can omit the field rather than mislabel. + */ +export function classifyLaunchFailure( + message: string, + options: ClassifyLaunchFailureOptions = {} +): RuntimeFailureReason | undefined { + const { spawnError, logTail } = options; + const messageNorm = normalize(message); + const spawnErrorNorm = normalize(spawnError ?? ""); + const logTailNorm = normalize(logTail ?? ""); + const haystack = [messageNorm, spawnErrorNorm, logTailNorm].join("\n"); + + // Spawn-time errno checks take priority because they describe why the OS + // could not even start the process. + if ( + spawnErrorNorm.includes("enoent") || + spawnErrorNorm.includes("no such file or directory") || + spawnErrorNorm.includes("command not found") + ) { + return "binary_missing"; + } + if ( + spawnErrorNorm.includes("eacces") || + spawnErrorNorm.includes("permission denied") || + spawnErrorNorm.includes("access denied") + ) { + return "binary_not_executable"; + } + if ( + spawnErrorNorm.includes("eaddrinuse") || + spawnErrorNorm.includes("address already in use") || + spawnErrorNorm.includes("port is already in use") + ) { + return "port_in_use"; + } + + // Pre-spawn command / binary validation messages. + if ( + messageNorm.includes("path traversal") || + messageNorm.includes("only llama-server executables are allowed") || + messageNorm.includes("invalid launch command") + ) { + return "unsupported_backend_flag"; + } + + if ( + messageNorm.includes("was not found") || + messageNorm.includes("command not found") + ) { + return "binary_missing"; + } + + if ( + haystack.includes("permission denied") && + !logTailNorm.includes("model") + ) { + return "binary_not_executable"; + } + + if ( + haystack.includes("address already in use") || + haystack.includes("port is already in use") + ) { + return "port_in_use"; + } + + // Model file issues (usually from log tail). + if ( + logTailNorm.includes("does not exist") || + logTailNorm.includes("no such file or directory") || + logTailNorm.includes("cannot open") || + logTailNorm.includes("file not found") || + logTailNorm.includes("model weights not found") + ) { + return "model_file_missing"; + } + + if (logTailNorm.includes("permission denied") && logTailNorm.includes("model")) { + return "model_file_unreadable"; + } + + if ( + logTailNorm.includes("corrupt") || + logTailNorm.includes("truncated") || + logTailNorm.includes("invalid safetensors") || + logTailNorm.includes("checksum") || + logTailNorm.includes("failed to load weights") + ) { + return "model_file_corrupt_or_truncated"; + } + + // Memory / capacity issues (from log tail). + if ( + logTailNorm.includes("cuda out of memory") || + logTailNorm.includes("out of cuda memory") || + logTailNorm.includes("not enough cuda memory") || + logTailNorm.includes("vram") + ) { + return "vram_oom"; + } + + if ( + logTailNorm.includes("out of memory") || + logTailNorm.includes("killed process") || + logTailNorm.includes("signal 9") || + logTailNorm.includes("oom") || + logTailNorm.includes("system ram") || + logTailNorm.includes("ram usage") + ) { + return "system_ram_oom_or_swap"; + } + + if ( + logTailNorm.includes("context length") || + logTailNorm.includes("max_model_len") || + logTailNorm.includes("max sequence length") || + logTailNorm.includes("context exceeds") || + logTailNorm.includes("too large for model") + ) { + return "context_exceeds_runtime_capacity"; + } + + if ( + logTailNorm.includes("kv cache") || + logTailNorm.includes("kv_cache") || + logTailNorm.includes("block size") || + logTailNorm.includes("kv cache capacity") + ) { + return "kv_cache_capacity_too_small"; + } + + // Backend-specific unsupported flags in log tail (after file/memory checks). + if ( + logTailNorm.includes("unsupported") || + logTailNorm.includes("unrecognized") || + logTailNorm.includes("invalid argument") || + logTailNorm.includes("error: argument") + ) { + return "unsupported_backend_flag"; + } + + // Post-spawn process lifecycle. + if ( + messageNorm.includes("process exited early") || + messageNorm.includes("crashed during startup") + ) { + return "process_exited_early"; + } + + if (messageNorm.includes("failed to become ready (timeout)")) { + return "health_timeout"; + } + + // Proxy / availability issues. + if ( + haystack.includes("model auto-loading is disabled") || + haystack.includes("no model is running") || + haystack.includes("is not. launch it") || + haystack.includes("model not managed") + ) { + return "model_not_served"; + } + + if ( + haystack.includes("backend unavailable") || + haystack.includes("failed to reach") || + haystack.includes("connection refused") || + haystack.includes("econnrefused") + ) { + return "backend_unreachable"; + } + + return undefined; +} diff --git a/controller/src/modules/engines/process/process-manager.ts b/controller/src/modules/engines/process/process-manager.ts index 2c93aa53..07d07b30 100644 --- a/controller/src/modules/engines/process/process-manager.ts +++ b/controller/src/modules/engines/process/process-manager.ts @@ -15,6 +15,7 @@ import type { Logger } from "../../../core/logger"; import type { LaunchResult, ProcessInfo, Recipe } from "../../models/types"; import type { EventManager } from "../../system/event-manager"; import { buildBackendCommand } from "./backend-builder"; +import { classifyLaunchFailure } from "./launch-failure-classifier"; import { buildEnvironment, collectChildren, @@ -284,11 +285,13 @@ export const createProcessManager = ( command = buildBackendCommand(updatedRecipe, config); } catch (error) { const message = error instanceof Error ? error.message : String(error); + const reason = classifyLaunchFailure(message); return { success: false, pid: null, message, log_file: primaryLogPathFor(config.data_dir, updatedRecipe.id), + ...(reason ? { reason } : {}), }; } if (!command) { @@ -297,6 +300,7 @@ export const createProcessManager = ( pid: null, message: "Invalid launch command", log_file: primaryLogPathFor(config.data_dir, updatedRecipe.id), + reason: "unsupported_backend_flag", }; } @@ -318,6 +322,7 @@ export const createProcessManager = ( pid: null, message: "Invalid launch command", log_file: logFile, + reason: "binary_missing", }; } let spawnError: string | null = null; @@ -384,11 +389,14 @@ export const createProcessManager = ( if (logStream) { logStream.end(); } + const reason = classifyLaunchFailure(spawnError, { spawnError }); return { success: false, pid: null, message: spawnError, log_file: logFile, + code: spawnError, + ...(reason ? { reason } : {}), }; } if (child.exitCode !== null) { @@ -400,6 +408,8 @@ export const createProcessManager = ( pid: null, message: "Process exited early", log_file: logFile, + reason: "process_exited_early", + code: `exit:${child.exitCode}`, }; } return { @@ -409,12 +419,15 @@ export const createProcessManager = ( log_file: logFile, }; } catch (error) { - logger.error("Launch failed", { error: String(error) }); + const message = String(error); + logger.error("Launch failed", { error: message }); + const reason = classifyLaunchFailure(message); return { success: false, pid: null, - message: String(error), + message, log_file: logFile, + ...(reason ? { reason } : {}), }; } }; diff --git a/controller/src/modules/engines/routes.ts b/controller/src/modules/engines/routes.ts index 4025c1a7..096d3dbd 100644 --- a/controller/src/modules/engines/routes.ts +++ b/controller/src/modules/engines/routes.ts @@ -184,7 +184,7 @@ export const registerEngineRoutes: RouteRegistrar = (app, context) => { }); if (!result.ok) { if (result.error.toLowerCase().includes("cancelled")) throw badRequest(result.error); - throw serviceUnavailable(result.error); + throw serviceUnavailable(result.error, result.reason, result.code); } return ctx.json({ success: true, message: "Launch started" }); } finally { diff --git a/controller/src/modules/models/types.ts b/controller/src/modules/models/types.ts index da781f29..329737fd 100644 --- a/controller/src/modules/models/types.ts +++ b/controller/src/modules/models/types.ts @@ -1,6 +1,7 @@ import type { Backend as SharedBackend, RecipeBase } from "../shared/recipe-types"; import type { ProcessInfo as PublicProcessInfo } from "../../../../shared/contracts/observability"; import type { ConfigData } from "../shared/system-types"; +import type { RuntimeFailureReason } from "../../../../shared/contracts/runtime-failures"; export type { ModelInfo } from "../shared/recipe-types"; export type { @@ -53,6 +54,8 @@ export interface LaunchResult { pid: number | null; message: string; log_file: string | null; + reason?: RuntimeFailureReason; + code?: string; } export interface GpuInfo { diff --git a/controller/src/modules/proxy/openai-routes.ts b/controller/src/modules/proxy/openai-routes.ts index 99c0eaba..faa99fdd 100644 --- a/controller/src/modules/proxy/openai-routes.ts +++ b/controller/src/modules/proxy/openai-routes.ts @@ -284,7 +284,8 @@ export const registerOpenAIRoutes: RouteRegistrar = (app, context) => { throw serviceUnavailable( activeModel ? `Model ${activeModel} is running; ${requestedModel} is not. Launch it from the frontend before sending requests.` - : `No model is running. Launch ${requestedModel} from the frontend before sending requests.` + : `No model is running. Launch ${requestedModel} from the frontend before sending requests.`, + "model_not_served" ); } } @@ -409,7 +410,8 @@ export const registerOpenAIRoutes: RouteRegistrar = (app, context) => { const reader = upstreamResponse.body?.getReader(); if (!reader) { throw serviceUnavailable( - providerRouting ? `${requestProvider} backend unavailable` : "Inference backend unavailable" + providerRouting ? `${requestProvider} backend unavailable` : "Inference backend unavailable", + "backend_unreachable" ); } diff --git a/controller/src/modules/system/event-manager.ts b/controller/src/modules/system/event-manager.ts index 23580a2b..d06337c3 100644 --- a/controller/src/modules/system/event-manager.ts +++ b/controller/src/modules/system/event-manager.ts @@ -125,12 +125,20 @@ export class EventManager { recipeId: string, stage: string, message: string, - progress?: number + progress?: number, + reason?: string, + code?: string ): Promise { const payload: Record = { recipe_id: recipeId, stage, message }; if (progress !== undefined) { payload["progress"] = progress; } + if (reason !== undefined) { + payload["reason"] = reason; + } + if (code !== undefined) { + payload["code"] = code; + } await this.publish(new Event(CONTROLLER_EVENTS.LAUNCH_PROGRESS, payload)); } diff --git a/frontend/src/hooks/realtime-status-store.ts b/frontend/src/hooks/realtime-status-store.ts index c993ee36..c7dd5e95 100644 --- a/frontend/src/hooks/realtime-status-store.ts +++ b/frontend/src/hooks/realtime-status-store.ts @@ -593,7 +593,9 @@ export function areLaunchProgressEqual(a: LaunchProgressData | null, b: LaunchPr a.recipe_id === b.recipe_id && a.stage === b.stage && a.message === b.message && - (a.progress ?? null) === (b.progress ?? null) + (a.progress ?? null) === (b.progress ?? null) && + (a.reason ?? null) === (b.reason ?? null) && + (a.code ?? null) === (b.code ?? null) ); } diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts index 30ecb4d3..aff87dfb 100644 --- a/frontend/src/lib/types.ts +++ b/frontend/src/lib/types.ts @@ -3,6 +3,7 @@ * frontend-only view models used across multiple features. */ import type { RecipeBase, RecipePayload } from "../../../shared/contracts/recipes"; +import type { RuntimeFailureReason } from "../../../shared/contracts/runtime-failures"; // --- Shared contract re-exports --- @@ -51,6 +52,7 @@ export type { UsageStats, } from "../../../shared/contracts/usage"; +export type { RuntimeFailureReason } from "../../../shared/contracts/runtime-failures"; export type { GPU, LogSession, @@ -91,6 +93,8 @@ export interface LaunchProgress { stage: LaunchStage; message?: string; progress?: number; + reason?: RuntimeFailureReason; + code?: string; } export interface LaunchProgressData extends LaunchProgress { diff --git a/shared/contracts/runtime-failures.ts b/shared/contracts/runtime-failures.ts new file mode 100644 index 00000000..3f6ab0af --- /dev/null +++ b/shared/contracts/runtime-failures.ts @@ -0,0 +1,24 @@ +/** + * Typed reasons for runtime/model launch failures. + * + * These are intentionally backend-agnostic labels that can be attached + * alongside raw human-readable messages. Keep values stable; they may be + * persisted in observability stores and consumed by the frontend / CLI. + */ +export type RuntimeFailureReason = + | "binary_missing" + | "binary_not_executable" + | "model_file_missing" + | "model_file_unreadable" + | "model_file_corrupt_or_truncated" + | "unsupported_backend_flag" + | "port_in_use" + | "vram_oom" + | "system_ram_oom_or_swap" + | "context_exceeds_runtime_capacity" + | "kv_cache_capacity_too_small" + | "health_timeout" + | "process_exited_early" + | "model_not_served" + | "backend_unreachable" + | "unknown"; diff --git a/tests/controller/integration/launch-failure-classifier.test.ts b/tests/controller/integration/launch-failure-classifier.test.ts new file mode 100644 index 00000000..12c7c0de --- /dev/null +++ b/tests/controller/integration/launch-failure-classifier.test.ts @@ -0,0 +1,127 @@ +import { describe, expect, test } from "bun:test"; + +import { classifyLaunchFailure } from "../../../controller/src/modules/engines/process/launch-failure-classifier"; + +describe("classifyLaunchFailure", () => { + test("classifies missing binary", () => { + expect(classifyLaunchFailure("Invalid llama_bin: executable \"/bin/foo\" was not found")).toBe( + "binary_missing", + ); + expect(classifyLaunchFailure("spawn error", { spawnError: "ENOENT" })).toBe("binary_missing"); + expect(classifyLaunchFailure("command not found: llamacpp")).toBe("binary_missing"); + expect(classifyLaunchFailure("No such file or directory", { spawnError: "enoent" })).toBe( + "binary_missing", + ); + }); + + test("classifies non-executable binary", () => { + expect(classifyLaunchFailure("spawn error", { spawnError: "EACCES" })).toBe( + "binary_not_executable", + ); + expect(classifyLaunchFailure("Permission denied")).toBe("binary_not_executable"); + expect(classifyLaunchFailure("spawn error", { spawnError: "access denied" })).toBe( + "binary_not_executable", + ); + }); + + test("classifies port in use", () => { + expect(classifyLaunchFailure("spawn error", { spawnError: "EADDRINUSE" })).toBe("port_in_use"); + expect(classifyLaunchFailure("Address already in use")).toBe("port_in_use"); + }); + + test("classifies unsupported backend flag / invalid command", () => { + expect(classifyLaunchFailure("Invalid launch command")).toBe("unsupported_backend_flag"); + expect( + classifyLaunchFailure("Invalid llama_bin: only llama-server executables are allowed"), + ).toBe("unsupported_backend_flag"); + expect(classifyLaunchFailure("Path traversal is not allowed: ../foo")).toBe( + "unsupported_backend_flag", + ); + }); + + test("classifies process exited early", () => { + expect(classifyLaunchFailure("Process exited early")).toBe("process_exited_early"); + expect( + classifyLaunchFailure("Model abc crashed during startup: CUDA error", { + logTail: "CUDA error", + }), + ).toBe("process_exited_early"); + }); + + test("classifies health timeout", () => { + expect( + classifyLaunchFailure("Model abc failed to become ready (timeout)"), + ).toBe("health_timeout"); + }); + + test("classifies model file issues from log tail", () => { + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "OSError: file does not exist: /models/foo.gguf", + }), + ).toBe("model_file_missing"); + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "Permission denied: /models/foo.gguf", + }), + ).toBe("model_file_unreadable"); + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "RuntimeError: weights file appears to be truncated", + }), + ).toBe("model_file_corrupt_or_truncated"); + }); + + test("classifies memory / capacity issues from log tail", () => { + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "torch.OutOfMemoryError: CUDA out of memory", + }), + ).toBe("vram_oom"); + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "Killed process 1234", + }), + ).toBe("system_ram_oom_or_swap"); + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "ValueError: max_model_len (32768) is too large for model", + }), + ).toBe("context_exceeds_runtime_capacity"); + expect( + classifyLaunchFailure("Model abc crashed during startup", { + logTail: "RuntimeError: KV cache capacity too small", + }), + ).toBe("kv_cache_capacity_too_small"); + }); + + test("classifies model not served", () => { + expect( + classifyLaunchFailure( + "Model foo is running; bar is not. Launch it from the frontend before sending requests.", + ), + ).toBe("model_not_served"); + expect(classifyLaunchFailure("No model is running. Launch foo from vLLM Studio.")).toBe( + "model_not_served", + ); + expect( + classifyLaunchFailure( + "Model auto-loading is disabled because the model was manually stopped.", + ), + ).toBe("model_not_served"); + }); + + test("classifies backend unreachable", () => { + expect(classifyLaunchFailure("Inference backend unavailable")).toBe("backend_unreachable"); + expect(classifyLaunchFailure("Connection refused")).toBe("backend_unreachable"); + expect(classifyLaunchFailure("upstream error", { spawnError: "ECONNREFUSED" })).toBe( + "backend_unreachable", + ); + }); + + test("returns undefined for cancellation and unknown strings", () => { + expect(classifyLaunchFailure("Launch cancelled")).toBeUndefined(); + expect(classifyLaunchFailure("Model switch cancelled")).toBeUndefined(); + expect(classifyLaunchFailure("Something completely unexpected happened")).toBeUndefined(); + }); +}); diff --git a/tests/controller/integration/observability-contracts.test.ts b/tests/controller/integration/observability-contracts.test.ts index d33d560b..002dfa9e 100644 --- a/tests/controller/integration/observability-contracts.test.ts +++ b/tests/controller/integration/observability-contracts.test.ts @@ -60,7 +60,7 @@ describe("controller route contracts", () => { tokens_total: 0, requests_total: 0, energy_wh: 0, - current_power_watts: 0, + current_power_watts: expect.any(Number), }); const benchmarkResponse = await app.request(