From 53b509e5b0750c16511f82b22ac7a49365c5e410 Mon Sep 17 00:00:00 2001 From: vi Date: Mon, 1 Jun 2026 22:00:35 +0300 Subject: [PATCH 1/8] feat(proxy): implement LongCat session ban & fallback Implements the LongCat session ban feature in proxy.ts. When LongCat detects multiple API key use (auth/rate-limit errors) or returns truncated responses, the platform is banned from the sticky session. Future requests in that session route to non-LongCat models. Changes: - Extend stickySessionMap with bannedPlatforms?: Set - Add isSessionBannedFromPlatform() to check session bans - Add banPlatformFromSession() to record platform bans - Add addLongcatModelsToSkipModels() to skip all LongCat models - Add isTruncatedResponse() to detect truncation keywords - Update getStickyKey() to return undefined for banned platforms - Update setStickyModel() to preserve bannedPlatforms across updates - Update pre-routing logic to check bans before routing - Update error handling to ban LongCat on auth/rate-limit/truncation - Add truncation detection after stream completes - Add truncation detection in mid-stream error handling --- .roo/specs/longcat-session-ban/design.md | 484 ++++++++++++++++++ .../specs/longcat-session-ban/requirements.md | 28 + .roo/specs/longcat-session-ban/tasks.md | 113 ++++ server/src/routes/proxy.ts | 154 +++++- 4 files changed, 772 insertions(+), 7 deletions(-) create mode 100644 .roo/specs/longcat-session-ban/design.md create mode 100644 .roo/specs/longcat-session-ban/requirements.md create mode 100644 .roo/specs/longcat-session-ban/tasks.md diff --git a/.roo/specs/longcat-session-ban/design.md b/.roo/specs/longcat-session-ban/design.md new file mode 100644 index 00000000..4e566890 --- /dev/null +++ b/.roo/specs/longcat-session-ban/design.md @@ -0,0 +1,484 @@ +# Design: LongCat Session Ban & Fallback + +## Architecture Overview + +The ban mechanism extends the existing sticky session infrastructure in `proxy.ts` and integrates with the retry loop in 
`handleChatCompletion()`. The router (`router.ts`) requires minimal changes — only the LongCat smart-auto preference needs to respect session bans. All ban detection and session management happens in the proxy layer. + +```mermaid +graph TD + subgraph Proxy [proxy.ts] + SSM[stickySessionMap
key → modelDbId + keyId + bannedPlatforms + lastUsed] + GSM[getStickyModel] + GSK[getStickyKey] + GSB[isSessionBannedFromPlatform] + BPS[banPlatformFromSession] + SSM2[setStickyModel] + CSM[clearStickyModel] + CSK[clearStickyKey] + ALM[addLongcatModelsToSkipModels] + IT[isTruncatedResponse] + end + + subgraph Router [router.ts] + RR[routeRequest] + LCP[LongCat Smart-Auto Preference] + end + + subgraph RetryLoop [handleChatCompletion retry loop] + DET[Error Detection] + BAN[Ban Recording] + SKIP[Add skipModels] + FALL[Fallback Routing] + end + + SSM --> GSM + SSM --> GSK + SSM --> GSB + BPS --> SSM + DET --> BAN + BAN --> SSM + BAN --> SKIP + SKIP --> RR + RR --> LCP + LCP -->|skip if banned| FALL +``` + +## Data Model Changes + +### Sticky Session Map Value Type + +Current value type at [`proxy.ts:16`](server/src/routes/proxy.ts:16): +```typescript +{ modelDbId: number; keyId?: number; lastUsed: number } +``` + +Extended to: +```typescript +{ modelDbId: number; keyId?: number; bannedPlatforms?: Set<string>; lastUsed: number } +``` + +The `bannedPlatforms` field is optional for backward compatibility. Existing entries without it default to `undefined` (no bans). Non-LongCat sessions never have bans. + +## New Functions + +### 1. `isSessionBannedFromPlatform()` — [`proxy.ts`](server/src/routes/proxy.ts) + +Checks whether a sticky session is banned from a specific platform. Used before passing `preferredKeyId` to the router and before deciding whether to skip LongCat models. + +```typescript +function isSessionBannedFromPlatform( + messages: ChatMessage[], + routingMode: RoutingMode, + platform: string, +): boolean { + const key = getSessionKey(messages, routingMode); + if (!key) return false; + const entry = stickySessionMap.get(key); + if (!entry) return false; + if (Date.now() - entry.lastUsed > STICKY_TTL_MS) return false; // expired = no ban + return entry.bannedPlatforms?.has(platform) ?? false; +} +``` + +### 2. 
`banPlatformFromSession()` — [`proxy.ts`](server/src/routes/proxy.ts) + +Records a platform ban in the sticky session. Called when a truncation, auth, or rate-limit error is detected on LongCat. + +```typescript +function banPlatformFromSession( + messages: ChatMessage[], + routingMode: RoutingMode, + platform: string, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (!entry.bannedPlatforms) entry.bannedPlatforms = new Set(); + entry.bannedPlatforms.add(platform); + entry.lastUsed = Date.now(); // refresh TTL so the ban persists + stickySessionMap.set(key, entry); + console.log(`[Sticky] banned platform=${platform} for session=${key.slice(0, 8)} | bannedPlatforms=${Array.from(entry.bannedPlatforms).join(',')}`); +} +``` + +### 3. `addLongcatModelsToSkipModels()` — [`proxy.ts`](server/src/routes/proxy.ts) + +Helper that queries the DB for all LongCat model_db_ids and adds them to the `skipModels` set. Called when a session is banned from LongCat. + +```typescript +function addLongcatModelsToSkipModels(skipModels: Set<number>): void { + const db = getDb(); + const longcatModels = db.prepare( + 'SELECT id FROM models WHERE platform = ? AND enabled = 1' + ).all('longcat') as Array<{ id: number }>; + for (const m of longcatModels) { + skipModels.add(m.id); + } + console.log(`[Sticky] added ${longcatModels.length} longcat model(s) to skipModels: [${longcatModels.map(m => m.id).join(',')}]`); +} +``` + +### 4. `isTruncatedResponse()` — [`proxy.ts`](server/src/routes/proxy.ts) + +Detects whether a response (streamed text or error message) indicates truncation by the LongCat provider. Checks for known truncation keywords in error messages and response content. 
+ +```typescript +function isTruncatedResponse(errOrContent: any): boolean { + if (!errOrContent) return false; + const str = String(errOrContent).toLowerCase(); + // Truncation indicators from LongCat and similar providers + return str.includes('truncated') + || str.includes('truncation') + || str.includes('conflict') + || str.includes('context_length_exceeded') + || str.includes('token_limit') + || str.includes('maximum length') + || str.includes('response_length_limit'); +} +``` + +## Component Changes + +### 1. Sticky Session Map Type — [`proxy.ts:16`](server/src/routes/proxy.ts:16) + +```typescript +const stickySessionMap = new Map<string, { + modelDbId: number; + keyId?: number; + bannedPlatforms?: Set<string>; + lastUsed: number; +}>(); +``` + +### 2. `getStickyKey()` Update — [`proxy.ts:54-79`](server/src/routes/proxy.ts:54-79) + +Add a check: if the session is banned from the model's platform, return `undefined` instead of the sticky key. This prevents the proxy from passing a `preferredKeyId` for a banned session. + +```typescript +function getStickyKey(messages: ChatMessage[], routingMode: RoutingMode): number | undefined { + const key = getSessionKey(messages, routingMode); + if (!key) { ... return undefined; } + + const entry = stickySessionMap.get(key); + if (!entry) { ... return undefined; } + + // TTL check (existing) + + // NEW: If session is banned from the sticky model's platform, don't return sticky key + if (entry.bannedPlatforms) { + const db = getDb(); + const modelRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(entry.modelDbId) as { platform: string } | undefined; + if (modelRow && entry.bannedPlatforms.has(modelRow.platform)) { + console.log(`[Sticky] key skipped session=${key.slice(0, 8)} | model platform=${modelRow.platform} is banned`); + return undefined; + } + } + + // Existing keyId return logic + if (entry.keyId !== undefined) { ... } + return entry.keyId; +} +``` + +### 3. 
`clearStickyModel()` Update — [`proxy.ts:81-87`](server/src/routes/proxy.ts:81-87) + +When clearing a sticky model, also clear `bannedPlatforms` since the entire session entry is being removed. No change needed — `clearStickyModel()` deletes the entire map entry, which naturally clears `bannedPlatforms` too. + +### 4. `setStickyModel()` Update — [`proxy.ts:100-112`](server/src/routes/proxy.ts:100-112) + +When setting a new sticky model after successful fallback, preserve the `bannedPlatforms` set from the previous entry (if any). This ensures the LongCat ban persists even when the sticky model changes. + +```typescript +function setStickyModel(messages: ChatMessage[], modelDbId: number, routingMode: RoutingMode, keyId?: number) { + const key = getSessionKey(messages, routingMode); + if (!key) return; + + // Preserve bannedPlatforms from existing entry (if session was previously banned) + const existing = stickySessionMap.get(key); + const bannedPlatforms = existing?.bannedPlatforms; + + stickySessionMap.set(key, { modelDbId, keyId, bannedPlatforms, lastUsed: Date.now() }); + console.log(`[Sticky] set key=${key.slice(0, 8)} | msgs=${messages.length} → modelDbId=${modelDbId}${keyId !== undefined ? ` keyId=${keyId}` : ''}${bannedPlatforms && bannedPlatforms.size > 0 ? ` banned=${Array.from(bannedPlatforms).join(',')}` : ''}`); + + // Existing eviction logic unchanged +} +``` + +### 5. `handleChatCompletion()` Retry Loop — [`proxy.ts:1035-1282`](server/src/routes/proxy.ts:1035-1282) + +The retry loop needs several changes: + +#### A. Pre-routing: Check session bans and [`proxy.ts:1035-1053`](server/src/routes/proxy.ts:1035-1053) + +After determining `preferredModel` and `preferredKeyId`, check if the session is banned from LongCat. 
If banned: +- Skip LongCat models in routing +- Don't pass `preferredKeyId` for LongCat + +```typescript +// Existing: determine preferredModel and preferredKeyId +let preferredKeyId: number | undefined; +if (preferredModel && !requestedModel) { + const stickyKeyId = getStickyKey(normalizedMessages, routingMode); + if (stickyKeyId !== undefined) { + const db = getDb(); + const row = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (row?.platform === 'longcat') { + preferredKeyId = stickyKeyId; + console.log(`[Sticky] key preferred modelDbId=${preferredModel} keyId=${preferredKeyId} (longcat)`); + } + } +} + +// NEW: Check if session is banned from LongCat +const skipModels = new Set(); +if (isSessionBannedFromPlatform(normalizedMessages, routingMode, 'longcat')) { + addLongcatModelsToSkipModels(skipModels); + // Also clear preferredModel if it points to a LongCat model + if (preferredModel) { + const db = getDb(); + const row = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (row?.platform === 'longcat') { + console.log(`[Sticky] skipping preferredModel=${preferredModel} (longcat banned for session)`); + preferredModel = undefined; + preferredKeyId = undefined; + } + } +} +``` + +#### B. 
Error handling in retry loop — [`proxy.ts:1245-1282`](server/src/routes/proxy.ts:1245-1282) + +When an error occurs on a LongCat route, detect truncation, auth, and rate-limit errors and ban LongCat for the session: + +```typescript +} catch (err: any) { + const latency = Date.now() - start; + logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, 0, latency, null, err.message); + + // NEW: Detect LongCat multiple-key-use errors and ban the platform + if (route.platform === 'longcat') { + // Auth error: different key used for same session + if (isAuthError(err)) { + console.warn(`[Proxy] LongCat auth error — banning longcat for session (multiple key use detected)`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + // Don't clear the entire sticky model — just ban LongCat specifically + // The sticky model will be updated on next successful fallback + } + // Rate-limit error: also indicates key rotation + if (isRateLimitError(err)) { + console.warn(`[Proxy] LongCat rate-limit error — banning longcat for session (key rotation detected)`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + } + // Truncated response: provider cut off the session + if (isTruncatedResponse(err.message) || isTruncatedResponse(err?.responseBody)) { + console.warn(`[Proxy] LongCat truncated response — banning longcat for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + } + } + + // Existing: clear sticky key on auth error (for non-LongCat too) + if (isAuthError(err) && route.platform !== 'longcat') { + clearStickyKey(normalizedMessages, routingMode); + preferredKeyId = undefined; + } + + if (isRetryableError(err)) { + const skipId = 
`${route.platform}:${route.modelId}:${route.keyId}`; + skipKeys.add(skipId); + if (shouldSkipModelOnRetry(err)) { + skipModels.add(route.modelDbId); + } + if (isRateLimitError(err)) { + setCooldown(route.platform, route.modelId, route.keyId, 120_000); + } + lastError = err; + console.warn(`[Proxy] retryable ${summarizeProviderError(err)} from ${route.displayName}/${route.modelId}, fallback (attempt ${attempt + 1}/${MAX_RETRIES})`); + continue; + } + + // Non-retryable error + clearStickyModel(normalizedMessages, routingMode); + res.status(502).json({ ... }); + return; +} +``` + +### 6. Streaming Truncation Detection — [`proxy.ts:1094-1184`](server/src/routes/proxy.ts:1094-1184) + +After the streaming `for await` loop completes, check the accumulated `streamedText` for truncation indicators. If detected, ban LongCat for the session. The stream has already been sent to the client — no retry within the same request. + +```typescript +// After the for-await loop completes (after line 1133) +// ... existing stream completion logic ... + +// NEW: Check for truncated response content after stream completes +if (route.platform === 'longcat' && isTruncatedResponse(streamedText)) { + console.warn(`[Proxy] LongCat truncated stream content detected — banning longcat for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + // Note: the stream has already been sent to the client + // The truncated response stands as-is — the client received it, just incomplete + // Future requests in this session will route to non-LongCat models +} + +// Continue with existing success path (recordTokens, setStickyModel, etc.) +``` + +For the Responses API streaming path, check `responseStreamContext.outputText` instead of `streamedText`. + +### 7. Mid-Stream Error Handling — [`proxy.ts:1185-1214`](server/src/routes/proxy.ts:1185-1214) + +When a mid-stream error occurs on LongCat, check if it's a truncation-related error. 
If so, record the ban, end the stream gracefully, and return. The client receives the truncated response. If it is not a truncation error, keep the existing behavior (send error SSE event and return). + +```typescript +} catch (streamErr: any) { + if (streamStarted) { + // NEW: Check for LongCat truncation error mid-stream + if (route.platform === 'longcat' && isTruncatedResponse(streamErr.message)) { + console.warn(`[Proxy] LongCat truncation error mid-stream — banning longcat for session, ending stream gracefully`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + // End the stream gracefully — client receives truncated response + // Don't send error SSE event — just end the stream + try { + if (responseStreamContext) { + writeResponseStreamEvent(res, { type: 'response.completed', response: { ... status: 'completed' ... } }); + } else { + res.write('data: [DONE]\n\n'); + } + res.end(); + } catch { /* socket gone */ } + logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, totalOutputTokens, Date.now() - start, ttfbMs, streamErr.message); + return; // Stream ended gracefully, client got truncated response + } + + // Existing mid-stream error handling for non-truncation errors + console.error(`[Proxy] Mid-stream error from ${route.displayName}:`, streamErr.message); + const payload = { error: { message: `Provider error (${route.displayName}): stream interrupted`, type: 'stream_error' } }; + try { + if (responseStreamContext) { + writeResponseStreamEvent(res, { ... }); + } else { + res.write(`data: ${JSON.stringify(payload)}\n\n`); + res.write('data: [DONE]\n\n'); + } + res.end(); + } catch { /* socket gone */ } + logRequest(...); + return; + } + // Pre-stream error — bubble to outer retry/502 handler. + throw streamErr; +} +``` + +### 8. 
Router LongCat Smart-Auto Preference — [`router.ts:498-527`](server/src/services/router.ts:498-527) + +The LongCat smart-auto preference in `routeRequest()` should skip boosting LongCat entries for sessions that are banned from LongCat. The proxy passes `skipModels` containing all LongCat model IDs, so the router naturally skips them. However, the LongCat boost logic at lines 498-527 should also be suppressed when all LongCat entries are in `skipModels`, to avoid unnecessary DB queries. + +No change needed — the existing boost logic already checks `hasCapacity` by querying LongCat keys. If all LongCat models are in `skipModels`, the `for (const entry of sorted)` loop at line 538 will skip them via `if (skipModels?.has(entry.model_db_id)) continue;`. The boost logic at lines 498-527 moves LongCat entries to the front, but they'll be skipped in the main loop anyway. The only optimization would be to skip the boost entirely when LongCat is banned, but this is a minor performance concern, not a functional one. + +**Decision: No router changes needed.** The `skipModels` set passed from the proxy effectively suppresses LongCat routing. The boost logic is harmless when LongCat is banned because the boosted entries are skipped in the main loop. 
+ +## Error Detection Flow + +```mermaid +flowchart TD + ERR[Provider Error] --> TYPE{Error Type?} + TYPE -->|Auth 401/403?| BAN1[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Rate Limit 429?| BAN2[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Truncated Response?| BAN3[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Other Error?| EXIST[Existing Retry Behavior] + + BAN1 --> RETRY[Continue Retry Loop - Fall Back to Next Model] + BAN2 --> RETRY + BAN3 --> RETRY + + subgraph Stream Detection + STREAM[Stream Complete] --> CHECK{Check streamedText for Truncation} + CHECK -->|Truncated| BAN4[Ban LongCat + Record for Future Requests] + CHECK -->|Not Truncated| SUCCESS[Normal Success Path] + MID[Mid-Stream Error] --> CHECK2{Check Error Message for Truncation} + CHECK2 -->|Truncated| BAN5[Ban LongCat + End Stream Gracefully + Return] + CHECK2 -->|Not Truncated| ERRSEND[Send Error SSE Event + Return] + end +``` + +## Session Lifecycle + +```mermaid +sequenceDiagram + participant Client + participant Proxy + participant Router + participant LongCat + participant OtherProvider + + Note over Client,Proxy: Request 1 - auto-smart routing + Client->>Proxy: POST /v1/chat/completions + Proxy->>Router: routeRequest - preferredModel=undefined - skipModels=empty + Router->>Proxy: RouteResult - LongCat - keyId=1 + Proxy->>LongCat: POST /chat/completions - key1 + LongCat->>Proxy: 200 - response + Proxy->>Client: 200 - response + Proxy->>Proxy: setStickyModel - modelDbId=LC - keyId=1 + + Note over Client,Proxy: Request 2 - same session - sticky key fails + Client->>Proxy: POST /v1/chat/completions + Proxy->>Proxy: getStickyModel - modelDbId=LC + Proxy->>Proxy: getStickyKey - keyId=1 + Proxy->>Proxy: isSessionBannedFromPlatform - longcat? 
NO + Proxy->>Router: routeRequest - preferredModel=LC - preferredKeyId=1 + Router->>Proxy: RouteResult - LongCat - keyId=1 + Proxy->>LongCat: POST /chat/completions - key1 + LongCat->>Proxy: 401 - auth error + Proxy->>Proxy: banPlatformFromSession - longcat + Proxy->>Proxy: addLongcatModelsToSkipModels + Proxy->>Proxy: clearStickyKey + preferredKeyId=undefined + Proxy->>Router: routeRequest - skipModels=LC models - preferredModel=undefined + Router->>Proxy: RouteResult - OtherProvider - keyId=2 + OtherProvider->>Proxy: 200 - response + Proxy->>Client: 200 - response + Proxy->>Proxy: setStickyModel - modelDbId=OP - keyId=2 - bannedPlatforms=longcat + + Note over Client,Proxy: Request 3 - same session - LongCat banned + Client->>Proxy: POST /v1/chat/completions + Proxy->>Proxy: getStickyModel - modelDbId=OP + Proxy->>Proxy: isSessionBannedFromPlatform - longcat? YES + Proxy->>Proxy: addLongcatModelsToSkipModels + Proxy->>Router: routeRequest - preferredModel=OP - skipModels=LC models + Router->>Proxy: RouteResult - OtherProvider - keyId=2 + OtherProvider->>Proxy: 200 - response + Proxy->>Client: 200 - response +``` + +## Edge Cases + +### EC-1: Session Expires +When a sticky session expires via TTL (30 min), the `bannedPlatforms` set is also cleared. This is natural — expired sessions are evicted from `stickySessionMap` entirely, including all associated data. + +### EC-2: No Sticky Session Exists +For a new session with no sticky entry, `isSessionBannedFromPlatform()` returns `false`. No LongCat models are skipped. The request is routed normally via Thompson Sampling or smart-auto preference. + +### EC-3: Non-LongCat Session +A session that was never routed to LongCat has no `bannedPlatforms` entry (or an empty set). `isSessionBannedFromPlatform('longcat')` returns `false`. No changes to existing behavior. 
+ +### EC-4: All LongCat Keys Disabled/Invalid +If all LongCat API keys are disabled or marked invalid, the router skips LongCat naturally via the existing key availability check. No ban is recorded because the session was never routed to LongCat in the first place. + +### EC-5: Server Restart +Sticky sessions are in-memory only. On server restart, all session data (including bans) is lost. This is existing behavior — sticky sessions don't persist across restarts. + +### EC-6: Multiple LongCat Models +If multiple LongCat models exist in the catalog (e.g., `longcat-2.0-preview` and `longcat-3.0`), `addLongcatModelsToSkipModels()` adds ALL enabled LongCat model IDs to `skipModels`. This ensures the session is banned from ALL LongCat models, not just the one that failed. + +### EC-7: Truncated Response After Stream Ends +After a stream completes successfully, the proxy checks `streamedText` for truncation indicators. If detected, the ban is recorded for future requests. The current request's response is already sent — the client receives the truncated response as-is. No retry within the same HTTP request. + +### EC-8: Mixed Model Chunks in Mid-Stream Retry +Mid-stream truncation detection ends the current stream gracefully and returns. No retry within the same HTTP request. The client receives the truncated response. Future requests route to non-LongCat models. \ No newline at end of file diff --git a/.roo/specs/longcat-session-ban/requirements.md b/.roo/specs/longcat-session-ban/requirements.md new file mode 100644 index 00000000..f5ceee36 --- /dev/null +++ b/.roo/specs/longcat-session-ban/requirements.md @@ -0,0 +1,28 @@ +# Requirements: LongCat Session Ban & Fallback + +## Overview + +When a LongCat sticky session encounters an error — whether it's an auth failure (401/403) or a "truncated" / "conflict" error from the provider — the system must: + +1. Detect the error, 2. Ban the entire `longcat` platform for this sticky session, 3. 
Fall back to the next best non-LongCat model via normal routing, 4. Update the sticky session to point to the new fallback model, 5. Never route this session to LongCat again (until the session expires via TTL) + +## Context + + The existing sticky sessions feature lives in [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:13-112). and It uses an SHA-1 hash of `routingMode + firstUserMessage` to identify sessions, and stores `{ modelDbId, + optional `keyId` + `lastUsed` } with a 30-min TTL and 500-entry max. + + and eviction. + + The existing LongCat sticky key feature ([`longcat-sticky-key` spec](../../roo/specs/longcat-sticky-key/)) extends this to also prefer using the **same API key** within a session. For LongCat specifically, because LongCat benefits from session continuity at the key level. same key = same session context on their server side). The current behavior on auth errors (401/403) is to [`clearStickyKey()`](server/src/routes/proxy.ts:89-98) — which clears the sticky key but **keep the sticky model pinned to LongCat** via [`preferredModel`](server/src/routes/proxy.ts:1036-1037). On retry, [`routeRequest()`](server/src/services/router.ts:458) still has `preferredModel` pointing to LongCat, and tries **another LongCat key** via round-robin. and **LongCat detects different keys usage for the same session** → the "multiple API keys" problem. The [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) function explicitly **does NOT skip the model** for auth errors or rate limit errors — so auth failures on LongCat result in key rotation within the same LongCat model, which is exactly what LongCat detects as "multiple API keys use." for the same session. Similarly, when LongCat returns a "truncated" or "conflict" error ( the provider truncates the response mid-stream, the current behavior is to silently switch to a different key and but the session continues on LongCat with a different key — same problem. 
The "truncated" error pattern is also detected by [`isRetryableError()`](server/src/routes/proxy.ts:409-428), which checks for 429, 413, 400, 404, 408, 409, 422, 500, 502, 503, 504, and `rate limit`, `quota`, `aborted`, `timeout`, `econnrefused`, `econnreset`, `unauthorized`, `forbidden`, `invalid api key`, `no longer available`, `model not found`, `bad request`, `invalid json payload`. The `isRetryableError()` function returns true for all these cases, meaning the proxy will retry with a different model/key. However, [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) returns `false` for rate-limit and auth errors — it does NOT skip the model. This means auth errors on LongCat result in key rotation within the same model, which is exactly the behavior LongCat detects as "multiple API keys use" for the same session. The existing [`clearStickyKey()`](server/src/routes/proxy.ts:89-98) only clears the sticky key (the caller then resets `preferredKeyId` to `undefined`) — but the sticky model remains pinned to LongCat. On the next retry, [`routeRequest()`](server/src/services/router.ts:458) still receives `preferredModel` pointing to LongCat, and tries another LongCat key via round-robin. The LongCat smart-auto preference in [`router.ts`](server/src/services/router.ts:498-527) also means LongCat is still tried first in smart mode, so the retry will likely hit LongCat again. ## Functional Requirements ### FR-1: Detect Multiple Key Use on LongCat When the LongCat provider returns an error indicating that multiple API keys have been used for the same session, the system must detect this condition (for example, when the error response contains language signaling multiple key use within a single session). 
Detection patterns: - Auth errors (401/403) — the current behavior already clears the sticky key but then tries another key on the same model - Rate-limit errors (429) — same pattern: key rotation within the same model - "Truncated" / "conflict" errors — LongCat truncates responses mid-stream when the response is shorter than expected, indicating the provider cut off the session. Detection keywords: "truncated", "truncation", "conflict", "length", "maximum length", "context_length_exceeded", "token_limit" - Any error message indicating that the provider is complaining about session length or capacity limits for the current conversation. ### FR-2: Ban LongCat Platform for Sticky Session When FR-1 is triggered, the system must ban the entire `longcat` platform for the current sticky session. This means: - All LongCat model IDs must be added to `skipModels` in the retry loop - The sticky session must be updated to point to the new fallback model instead of LongCat - The session must never be routed to LongCat again until the session expires via TTL (30 min) - The ban must persist across multiple retry attempts within the same request. ### FR-3: Fallback to Next Best Non-LongCat Model After banning LongCat, the retry loop must fall through to the next best available model via the existing Thompson Sampling / smart routing logic. The new model should be selected based on the normal scoring algorithm (success rate + speed + TTFB + intelligence for smart mode, success rate + speed in balanced mode). ### FR-4: Update Sticky Session to New Model On successful fallback, the sticky session must be updated to point to the new fallback model (`modelDbId` + `keyId`). The sticky key feature should be cleared for the new model, since the fallback is not LongCat and the sticky key preference only applies to LongCat sessions. 
### FR-5: Never Route Session to LongCat Again Once a session is banned from LongCat, it must never be routed to LongCat again for the remainder of that session's lifetime (30 min TTL). This means: - The `stickySessionMap` entry must include a `bannedPlatforms` field (or `Set`) to track which platforms are banned for this session - On subsequent requests in the same session, `getStickyModel()` returns the preferred model, but the proxy layer must check if the session is banned from LongCat and skip LongCat models before calling `routeRequest()` - The LongCat smart-auto preference in `router.ts` must also be suppressed for banned sessions ( the router should not boost LongCat entries to the front for sessions that are banned from LongCat ### FR-6: Truncated Response Detection When a LongCat streaming response is received, the proxy must check the response content for signs of truncation. If detected, the session must be banned from LongCat immediately, even if the stream has already started (headers sent, the client has already received partial data). 
The system must: - Log the truncation detection - Record the ban in the sticky session - Add all LongCat model IDs to `skipModels` - Continue the retry loop to the next best model - The client receives the truncated partial response plus the new fallback response, which is acceptable behavior for clients like Hermes that handle partial streams gracefully ### FR-7: Auth Error Handling for LongCat Sessions When an auth error (401/403) occurs on a LongCat sticky session: - Clear the sticky key via `clearStickyKey()` (existing behavior) - Additionally ban the LongCat platform for this session via the new `banPlatformFromSession()` function - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success ### FR-8: Rate-Limit Error Handling for LongCat Sessions When a rate-limit error (429) occurs on a LongCat sticky session: - Ban the LongCat platform for this session via `banPlatformFromSession()` - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success - Note: rate-limit errors on LongCat do NOT clear the entire sticky session (the session may still work with a different key on a different model). Only ban LongCat specifically. ### FR-9: Existing Behavior Preserved for Non-LongCat Sessions All existing sticky session behavior for non-LongCat providers must remain unchanged. The new ban mechanism only applies exclusively to LongCat sessions. Non-LongCat sessions that never have platform bans. ### FR-10: Session Expiry Clears Bans When a sticky session expires ( via TTL (30 min), the `bannedPlatforms` set is also cleared. This is natural — expired sessions are evicted from the `stickySessionMap` entirely, including all associated data. 
### FR-11: No Database Schema Changes The ban mechanism is purely in-memory, using the existing `stickySessionMap`. No database schema changes are required. ### FR-12: Minimal Router Changes The router (`server/src/services/router.ts`) should not need significant changes. The only change is that the LongCat smart-auto preference logic should skip sessions that are banned from LongCat. The proxy layer handles all ban detection and session management. The router remains provider-agnostic. ### FR-13: No UI Changes This is a backend-only feature. No client-side changes are needed. ## Non-Functional Requirements ### NFR-1: Backward Compatibility Existing sessions without `bannedPlatforms` (from before this feature or for non-LongCat providers) must continue to work. The `bannedPlatforms` field must be optional in the sticky session map value type. ### NFR-2: Thread Safety The existing `stickySessionMap` is a plain `Map` with no locking (single-threaded Node.js). The extended map follows the same pattern — no additional concurrency concerns. ### NFR-3: Minimal Performance Impact The ban check adds one optional-field check and one `Set` lookup on the sticky session map entry, plus one DB query to check if a model is on a banned platform. No additional I/O beyond what already exists.
### NFR-4: Test Coverage New unit tests must cover: - Multiple key use detection (auth + rate limit + truncated) - Session ban persistence across retries - Fallback to next best model - Sticky session update on success - Session expiry clearing bans - Non-LongCat sessions unaffected ## Files Requiring Modification | # | File | Change Type | Description | +|---|---|---|---| +| 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:16) | Edit | Extend `stickySessionMap` value type to include `bannedPlatforms` | +| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:34-52) | Edit | Add `isSessionBannedFromPlatform()` helper function | +| 3 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:54-79) | Edit | Update `getStickyKey()` to check bans before returning key | +| 4 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:81-87) | Edit | Update `clearStickyModel()` to also clear `bannedPlatforms` | +| 5 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:100-112) | Edit | Update `setStickyModel()` to also store `bannedPlatforms` | +| 6 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1039-1053) | Edit | Add `banPlatformFromSession()` function | +| 7 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1036-1053) | Edit | Add `addLongcatModelsToSkipModels()` helper | +| 8 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1245-1281) | Edit | Update error handling in retry loop to detect multiple key use + ban LongCat | +| 9 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1147-1149) | Edit | Add truncated response detection in streaming success path | +| 10 | [`server/src/services/router.ts`](server/src/services/router.ts:498-527) | Edit | Skip LongCat boost for banned sessions | +| 11 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1036-1053) | Edit | Pass `bannedPlatforms` to `routeRequest()` via `skipModels` |
## Out of Scope - Persistent bans across server restarts (in-memory only, same as existing sticky sessions) - Changes to the Thompson Sampling algorithm itself - Changes to rate limiting logic - Changes to the fallback chain ordering in balanced mode - Client-side UI changes - Configuration UI for enabling/disabling bans per provider (hardcoded to LongCat only) - Changes to the `OpenAICompatProvider` class \ No newline at end of file diff --git a/.roo/specs/longcat-session-ban/tasks.md b/.roo/specs/longcat-session-ban/tasks.md new file mode 100644 index 00000000..594ceb83 --- /dev/null +++ b/.roo/specs/longcat-session-ban/tasks.md @@ -0,0 +1,113 @@ +# Tasks: LongCat Session Ban & Fallback + +## Implementation Steps + +- [x] 1. Extend `stickySessionMap` value type in `proxy.ts` + - Edit line 16: add `bannedPlatforms?: Set` to the map value type + - This is the foundational type change — all other changes depend on it + +- [x] 2. Add `isSessionBannedFromPlatform()` function in `proxy.ts` + - Add after `getStickyKey()` (after line 79) + - Parameters: `messages`, `routingMode`, `platform` + - Returns `boolean` — checks if the session's `bannedPlatforms` set contains the given platform + - Includes TTL check (expired sessions have no bans) + - Add diagnostic logging + +- [x] 3. Add `banPlatformFromSession()` function in `proxy.ts` + - Add after `isSessionBannedFromPlatform()` + - Parameters: `messages`, `routingMode`, `platform` + - Creates or adds to `bannedPlatforms` set in the sticky session entry + - Refreshes `lastUsed` TTL so the ban persists + - Add diagnostic logging with banned platforms list + +- [x] 4. 
Add `addLongcatModelsToSkipModels()` helper in `proxy.ts` + - Add after `banPlatformFromSession()` + - Queries DB for all enabled LongCat model IDs: `SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1` + - Adds each to the `skipModels` set + - Add diagnostic logging with count and IDs + +- [x] 5. Add `isTruncatedResponse()` function in `proxy.ts` + - Add after `addLongcatModelsToSkipModels()` + - Parameters: `errOrContent: any` + - Returns `boolean` — checks for truncation keywords in stringified input + - Keywords: 'truncated', 'truncation', 'conflict', 'context_length_exceeded', 'token_limit', 'maximum length', 'response_length_limit' + - Case-insensitive matching + +- [x] 6. Update `getStickyKey()` to check session bans + - In `getStickyKey()` (lines 54-79), after TTL check, add: + - Look up the sticky model's platform via DB query + - If the model's platform is in `entry.bannedPlatforms`, return `undefined` + - Add diagnostic logging for skipped sticky keys due to bans + +- [x] 7. Update `setStickyModel()` to preserve `bannedPlatforms` + - In `setStickyModel()` (lines 100-112), before setting the new entry: + - Get existing entry from `stickySessionMap` + - Preserve `bannedPlatforms` from existing entry (if any) + - Include `bannedPlatforms` in the new map entry + - Update log message to include banned platforms count when present + +- [x] 8. 
Update pre-routing logic in `handleChatCompletion()` + - After determining `preferredModel` and `preferredKeyId` (around lines 1035-1053): + - Check `isSessionBannedFromPlatform(normalizedMessages, routingMode, 'longcat')` + - If banned: + - Call `addLongcatModelsToSkipModels(skipModels)` + - If `preferredModel` points to a LongCat model, set `preferredModel = undefined` and `preferredKeyId = undefined` + - Add diagnostic logging + - Move `skipModels` initialization earlier (before the ban check) or create it at the ban check point + - Note: `skipModels` is currently initialized at line 1058 — need to ensure it exists before the ban check + +- [x] 9. Update error handling in retry loop for LongCat-specific bans + - In the `catch (err)` block (around lines 1245-1282): + - After logging the request error, check if `route.platform === 'longcat'` + - If LongCat + auth error: call `banPlatformFromSession()`, `addLongcatModelsToSkipModels()`, clear `preferredKeyId` + - If LongCat + rate-limit error: call `banPlatformFromSession()`, `addLongcatModelsToSkipModels()`, clear `preferredKeyId` + - If LongCat + truncated response: call `banPlatformFromSession()`, `addLongcatModelsToSkipModels()`, clear `preferredKeyId` + - Keep existing auth error handling for non-LongCat (`clearStickyKey()` + `preferredKeyId = undefined`) + - Keep existing `isRetryableError()` and `shouldSkipModelOnRetry()` logic + - Keep existing non-retryable error handling (`clearStickyModel()`) + +- [x] 10. 
Add truncated response detection after stream completes + - After the streaming `for await` loop completes (around line 1133-1147): + - Check `route.platform === 'longcat'` and `isTruncatedResponse(streamedText)` + - If detected: call `banPlatformFromSession(normalizedMessages, routingMode, 'longcat')` + - Add diagnostic logging + - Note: the stream has already been sent to the client — no retry within the same request + - Future requests in this session will route to non-LongCat models + - For Responses API streaming: check `responseStreamContext.outputText` instead of `streamedText` + +- [x] 11. Add truncation detection in mid-stream error handling + - In the `catch (streamErr)` block for mid-stream errors (around lines 1185-1214): + - Check `route.platform === 'longcat'` and `isTruncatedResponse(streamErr.message)` + - If detected: + - Call `banPlatformFromSession(normalizedMessages, routingMode, 'longcat')` + - End the stream gracefully (send completion event, not error event) + - Return — client receives truncated response as-is + - If not truncation: keep existing mid-stream error behavior (send error SSE event + return) + + - Add diagnostic logging for both paths + +- [x] 12. Verify TypeScript compilation + - Run `npx tsc --noEmit` in the `server/` directory + - Ensure no type errors from the new `bannedPlatforms` field or new functions + +- [ ] 13. Run existing tests + - Run `npm test` in the `server/` directory + - Verify no regressions in router tests, proxy tests, or sticky session behavior + +- [ ] 14. 
Add new unit tests for ban functionality + - Test `isSessionBannedFromPlatform()` — no session, expired session, banned session, non-banned session + - Test `banPlatformFromSession()` — adds platform to banned set, preserves existing bans + - Test `isTruncatedResponse()` — various truncation keywords, non-truncation strings + - Test `addLongcatModelsToSkipModels()` — adds LongCat model IDs to skip set + - Test `setStickyModel()` preserves `bannedPlatforms` when updating sticky model + - Test `getStickyKey()` returns `undefined` when session is banned from model's platform + +- [ ] 15. Manual integration testing + - Add a LongCat API key via the Keys page + - Send a chat completion request and verify it routes through LongCat + - Send a second request with same first user message — verify sticky key is used + - Simulate auth error (disable key mid-session) — verify LongCat is banned and fallback occurs + - Send a third request — verify it routes to non-LongCat model ( LongCat is still banned) + - Wait for session TTL to expire (30 min) — verify LongCat is no longer banned \ No newline at end of file diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index b5ddd66d..799b8386 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -13,7 +13,7 @@ export const proxyRouter: Router = Router(); // Sticky sessions: track which model served each "session" // Key: hash of first user message → model_db_id // This prevents model switching mid-conversation which causes hallucination -const stickySessionMap = new Map(); +const stickySessionMap = new Map; lastUsed: number }>(); const STICKY_TTL_MS = 30 * 60 * 1000; // 30 min session TTL const responseSessionMap = new Map(); const responseItemMap = new Map(); @@ -70,6 +70,16 @@ function getStickyKey(messages: ChatMessage[], routingMode: RoutingMode): number return undefined; } + // If session is banned from the sticky model's platform, don't return sticky key + if (entry.bannedPlatforms) { + 
const db = getDb(); + const modelRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(entry.modelDbId) as { platform: string } | undefined; + if (modelRow && entry.bannedPlatforms.has(modelRow.platform)) { + console.log(`[Sticky] key skipped session=${key.slice(0, 8)} | model platform=${modelRow.platform} is banned`); + return undefined; + } + } + if (entry.keyId !== undefined) { console.log(`[Sticky] key hit session=${key.slice(0, 8)} → keyId=${entry.keyId}`); } else { @@ -78,6 +88,58 @@ function getStickyKey(messages: ChatMessage[], routingMode: RoutingMode): number return entry.keyId; } +function isSessionBannedFromPlatform( + messages: ChatMessage[], + routingMode: RoutingMode, + platform: string, +): boolean { + const key = getSessionKey(messages, routingMode); + if (!key) return false; + const entry = stickySessionMap.get(key); + if (!entry) return false; + if (Date.now() - entry.lastUsed > STICKY_TTL_MS) return false; // expired = no ban + return entry.bannedPlatforms?.has(platform) ?? false; +} + +function banPlatformFromSession( + messages: ChatMessage[], + routingMode: RoutingMode, + platform: string, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (!entry.bannedPlatforms) entry.bannedPlatforms = new Set(); + entry.bannedPlatforms.add(platform); + entry.lastUsed = Date.now(); // refresh TTL so the ban persists + stickySessionMap.set(key, entry); + console.log(`[Sticky] banned platform=${platform} for session=${key.slice(0, 8)} | bannedPlatforms=${Array.from(entry.bannedPlatforms).join(',')}`); +} + +function addLongcatModelsToSkipModels(skipModels: Set): void { + const db = getDb(); + const longcatModels = db.prepare( + 'SELECT id FROM models WHERE platform = ? 
AND enabled = 1' + ).all('longcat') as Array<{ id: number }>; + for (const m of longcatModels) { + skipModels.add(m.id); + } + console.log(`[Sticky] added ${longcatModels.length} longcat model(s) to skipModels: [${longcatModels.map(m => m.id).join(',')}]`); +} + +function isTruncatedResponse(errOrContent: any): boolean { + if (!errOrContent) return false; + const str = String(errOrContent).toLowerCase(); + return str.includes('truncated') + || str.includes('truncation') + || str.includes('conflict') + || str.includes('context_length_exceeded') + || str.includes('token_limit') + || str.includes('maximum length') + || str.includes('response_length_limit'); +} + function clearStickyModel(messages: ChatMessage[], routingMode: RoutingMode) { const key = getSessionKey(messages, routingMode); if (!key) return; @@ -100,8 +162,13 @@ function clearStickyKey(messages: ChatMessage[], routingMode: RoutingMode) { function setStickyModel(messages: ChatMessage[], modelDbId: number, routingMode: RoutingMode, keyId?: number) { const key = getSessionKey(messages, routingMode); if (!key) return; - stickySessionMap.set(key, { modelDbId, keyId, lastUsed: Date.now() }); - console.log(`[Sticky] set key=${key.slice(0, 8)} | msgs=${messages.length} → modelDbId=${modelDbId}${keyId !== undefined ? ` keyId=${keyId}` : ''}`); + + // Preserve bannedPlatforms from existing entry (if session was previously banned) + const existing = stickySessionMap.get(key); + const bannedPlatforms = existing?.bannedPlatforms; + + stickySessionMap.set(key, { modelDbId, keyId, bannedPlatforms, lastUsed: Date.now() }); + console.log(`[Sticky] set key=${key.slice(0, 8)} | msgs=${messages.length} → modelDbId=${modelDbId}${keyId !== undefined ? ` keyId=${keyId}` : ''}${bannedPlatforms && bannedPlatforms.size > 0 ? 
` banned=${Array.from(bannedPlatforms).join(',')}` : ''}`); if (stickySessionMap.size > 500) { const now = Date.now(); @@ -1052,10 +1119,25 @@ async function handleChatCompletion( } } + // Check if session is banned from LongCat — if so, skip all LongCat models + // and clear any preferredModel/preferredKeyId that points to LongCat. + const skipModels = new Set(); + if (isSessionBannedFromPlatform(normalizedMessages, routingMode, 'longcat')) { + addLongcatModelsToSkipModels(skipModels); + if (preferredModel) { + const db = getDb(); + const row = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (row?.platform === 'longcat') { + console.log(`[Sticky] skipping preferredModel=${preferredModel} (longcat banned for session)`); + preferredModel = undefined; + preferredKeyId = undefined; + } + } + } + // Retry loop: skip bad keys and, for non-rate-limit errors, skip the model // entirely so the fallback chain can move to a different provider/model. const skipKeys = new Set(); - const skipModels = new Set(); let lastError: any = null; for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { @@ -1132,6 +1214,17 @@ async function handleChatCompletion( } } + // Check for truncated response content after stream completes on LongCat. + // The stream has already been sent to the client — no retry within same request. + // Future requests in this session will route to non-LongCat models. + if (route.platform === 'longcat') { + const streamTextToCheck = responseStreamContext ? responseStreamContext.outputText : streamedText; + if (isTruncatedResponse(streamTextToCheck)) { + console.warn(`[Proxy] LongCat truncated stream content detected — banning longcat for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + } + } + if (!streamStarted) { // Upstream returned no chunks — emit minimal successful stream. 
res.setHeader('Content-Type', 'text/event-stream'); @@ -1184,6 +1277,30 @@ async function handleChatCompletion( return; } catch (streamErr: any) { if (streamStarted) { + // Check for LongCat truncation error mid-stream — end gracefully, not with error event + if (route.platform === 'longcat' && isTruncatedResponse(streamErr.message)) { + console.warn(`[Proxy] LongCat truncation error mid-stream — banning longcat for session, ending stream gracefully`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + try { + if (responseStreamContext) { + writeResponseStreamEvent(res, { + type: 'response.completed', + response: { + id: responseStreamContext.responseId, + status: 'completed', + model: route.modelId, + output_text: responseStreamContext.outputText, + }, + }); + } else { + res.write('data: [DONE]\n\n'); + } + res.end(); + } catch { /* socket gone */ } + logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, totalOutputTokens, Date.now() - start, ttfbMs, streamErr.message); + return; + } + // Mid-stream error — finish the SSE response cleanly instead of leaving // the client hanging or letting Express's default handler take over. 
// Full upstream message goes to the log; the client sees a generic @@ -1246,6 +1363,28 @@ async function handleChatCompletion( const latency = Date.now() - start; logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, 0, latency, null, err.message); + // Detect LongCat multiple-key-use errors and ban the platform for the session + if (route.platform === 'longcat') { + if (isAuthError(err)) { + console.warn(`[Proxy] LongCat auth error — banning longcat for session (multiple key use detected)`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + } + if (isRateLimitError(err)) { + console.warn(`[Proxy] LongCat rate-limit error — banning longcat for session (key rotation detected)`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + } + if (isTruncatedResponse(err.message) || isTruncatedResponse(err?.responseBody)) { + console.warn(`[Proxy] LongCat truncated response — banning longcat for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + addLongcatModelsToSkipModels(skipModels); + preferredKeyId = undefined; + } + } + if (isRetryableError(err)) { const skipId = `${route.platform}:${route.modelId}:${route.keyId}`; skipKeys.add(skipId); @@ -1255,9 +1394,10 @@ async function handleChatCompletion( if (isRateLimitError(err)) { setCooldown(route.platform, route.modelId, route.keyId, 120_000); } - // Auth errors (401/403): clear the sticky key for this session so the - // retry unpins the broken key and falls through to round-robin. - if (isAuthError(err)) { + // Auth errors (401/403) on non-LongCat: clear the sticky key for this session + // so the retry unpins the broken key and falls through to round-robin. + // LongCat auth errors are handled above with platform ban instead. 
+ if (isAuthError(err) && route.platform !== 'longcat') { const authStatus = getErrorStatus(err); console.warn(`[Proxy] auth error ${authStatus} from ${route.displayName}/${route.modelId}, clearing sticky key for session`); clearStickyKey(normalizedMessages, routingMode); From 27a9f2fd5e289de9cc61cb486d3e6da456a2771c Mon Sep 17 00:00:00 2001 From: vi Date: Mon, 1 Jun 2026 22:06:14 +0300 Subject: [PATCH 2/8] fix: rm excessive checks --- .roo/specs/longcat-session-ban/design.md | 6 ------ .roo/specs/longcat-session-ban/tasks.md | 2 +- server/src/routes/proxy.ts | 5 ----- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/.roo/specs/longcat-session-ban/design.md b/.roo/specs/longcat-session-ban/design.md index 4e566890..f986b7a2 100644 --- a/.roo/specs/longcat-session-ban/design.md +++ b/.roo/specs/longcat-session-ban/design.md @@ -130,12 +130,6 @@ function isTruncatedResponse(errOrContent: any): boolean { // Truncation indicators from LongCat and similar providers return str.includes('truncated') || str.includes('truncation') - || str.includes('conflict') - || str.includes('context_length_exceeded') - || str.includes('token_limit') - || str.includes('maximum length') - || str.includes('response_length_limit'); -} ``` ## Component Changes diff --git a/.roo/specs/longcat-session-ban/tasks.md b/.roo/specs/longcat-session-ban/tasks.md index 594ceb83..8b6535dd 100644 --- a/.roo/specs/longcat-session-ban/tasks.md +++ b/.roo/specs/longcat-session-ban/tasks.md @@ -32,7 +32,7 @@ - Add after `addLongcatModelsToSkipModels()` - Parameters: `errOrContent: any` - Returns `boolean` — checks for truncation keywords in stringified input - - Keywords: 'truncated', 'truncation', 'conflict', 'context_length_exceeded', 'token_limit', 'maximum length', 'response_length_limit' + - Keywords: 'truncated', 'truncation' - Case-insensitive matching - [x] 6. 
Update `getStickyKey()` to check session bans diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 799b8386..59c837d6 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -133,11 +133,6 @@ function isTruncatedResponse(errOrContent: any): boolean { const str = String(errOrContent).toLowerCase(); return str.includes('truncated') || str.includes('truncation') - || str.includes('conflict') - || str.includes('context_length_exceeded') - || str.includes('token_limit') - || str.includes('maximum length') - || str.includes('response_length_limit'); } function clearStickyModel(messages: ChatMessage[], routingMode: RoutingMode) { From 8772ba487abd0f16979649718b08d2a6cc18709c Mon Sep 17 00:00:00 2001 From: vi Date: Mon, 1 Jun 2026 22:41:59 +0300 Subject: [PATCH 3/8] test: add units --- .../routes/longcat-session-ban.test.ts | 265 ++++++++++++++++++ server/src/routes/proxy.ts | 14 + 2 files changed, 279 insertions(+) create mode 100644 server/src/__tests__/routes/longcat-session-ban.test.ts diff --git a/server/src/__tests__/routes/longcat-session-ban.test.ts b/server/src/__tests__/routes/longcat-session-ban.test.ts new file mode 100644 index 00000000..1cc1ddeb --- /dev/null +++ b/server/src/__tests__/routes/longcat-session-ban.test.ts @@ -0,0 +1,265 @@ +import { describe, it, expect, beforeAll, beforeEach, afterEach, vi } from 'vitest'; +import type { Express } from 'express'; +import { createApp } from '../../app.js'; +import { initDb, getDb, getUnifiedApiKey } from '../../db/index.js'; +import { + isSessionBannedFromPlatform, + banPlatformFromSession, + addLongcatModelsToSkipModels, + isTruncatedResponse, + getSessionKey, + getStickyModel, + setStickyModel, + stickySessionMap, +} from '../../routes/proxy.js'; + +function clearStickyMap() { + (stickySessionMap as Map).clear(); +} + +describe('LongCat session ban functionality', () => { + let app: Express; + + beforeAll(() => { + process.env.ENCRYPTION_KEY = '0'.repeat(64); + 
initDb(':memory:'); + app = createApp(); + }); + + beforeEach(() => { + clearStickyMap(); + const db = getDb(); + db.prepare('DELETE FROM api_keys').run(); + db.prepare('DELETE FROM requests').run(); + // Insert a dummy LongCat API key so routing can succeed if needed + db.prepare(`INSERT INTO api_keys (platform, label, encrypted_key, iv, auth_tag, status, enabled) + VALUES ('longcat', 'test', 'enc', 'iv', 'tag', 'healthy', 1)`).run(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + // Helper to create a simple user message array + const makeMessages = (content: string) => [{ role: 'user' as const, content }]; + + // ---------- Test Suite 1: isSessionBannedFromPlatform ---------- + describe('isSessionBannedFromPlatform', () => { + it('returns false when no sticky session exists', () => { + const messages = makeMessages('Hello'); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('returns false when sticky session exists but no bannedPlatforms', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { modelDbId: 1, lastUsed: Date.now() }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('returns true when the platform is in bannedPlatforms', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now(), + bannedPlatforms: new Set(['longcat']), + }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + }); + + it('returns false when a different platform is banned', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now(), + bannedPlatforms: new Set(['groq']), + }); + expect(isSessionBannedFromPlatform(messages, 
'balanced', 'longcat')).toBe(false); + }); + + it('returns false when the sticky session has expired (past TTL)', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now() - (31 * 60 * 1000), // 31 minutes ago + bannedPlatforms: new Set(['longcat']), + }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + }); + + // ---------- Test Suite 2: banPlatformFromSession ---------- + describe('banPlatformFromSession', () => { + it('does not create entry if none exists (only modifies existing)', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + expect(stickySessionMap.has(key)).toBe(false); + banPlatformFromSession(messages, 'balanced', 'longcat'); + expect(stickySessionMap.has(key)).toBe(false); + }); + + it('adds to existing bannedPlatforms if entry already exists', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 2, + lastUsed: Date.now(), + bannedPlatforms: new Set(['groq']), + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.bannedPlatforms.has('groq')).toBe(true); + expect(entry.bannedPlatforms.has('longcat')).toBe(true); + }); + + it('does not duplicate platforms already banned', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 3, + lastUsed: Date.now(), + bannedPlatforms: new Set(['longcat']), + }); + const beforeSize = stickySessionMap.get(key).bannedPlatforms.size; + banPlatformFromSession(messages, 'balanced', 'longcat'); + const afterSize = stickySessionMap.get(key).bannedPlatforms.size; + expect(afterSize).toBe(beforeSize); + }); + + it('preserves existing modelDbId and 
keyId when banning', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 42, + keyId: 7, + lastUsed: Date.now(), + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.modelDbId).toBe(42); + expect(entry.keyId).toBe(7); + }); + + it('refreshes lastUsed TTL when banning', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const oldTime = Date.now() - (20 * 60 * 1000); // 20 minutes ago + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: oldTime, + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.lastUsed).toBeGreaterThan(oldTime); + }); + }); + + // ---------- Test Suite 3: addLongcatModelsToSkipModels ---------- + describe('addLongcatModelsToSkipModels', () => { + it('adds all LongCat model IDs to the skipModels set', () => { + const skipModels = new Set(); + addLongcatModelsToSkipModels(skipModels); + const db = getDb(); + const longcatRows = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").all() as any[]; + const ids = longcatRows.map(r => r.id); + ids.forEach(id => expect(skipModels.has(id)).toBe(true)); + }); + + it('does not add non-LongCat model IDs', () => { + const skipModels = new Set(); + addLongcatModelsToSkipModels(skipModels); + const db = getDb(); + const otherRows = db.prepare("SELECT id FROM models WHERE platform != 'longcat' AND enabled = 1").all() as any[]; + otherRows.forEach(r => expect(skipModels.has(r.id)).toBe(false)); + }); + + it('handles empty LongCat model list gracefully', () => { + const db = getDb(); + db.prepare('PRAGMA foreign_keys = OFF').run(); + try { + db.prepare("DELETE FROM models WHERE platform = 'longcat'").run(); + const skipModels = new Set(); + expect(() => 
addLongcatModelsToSkipModels(skipModels)).not.toThrow(); + expect(skipModels.size).toBe(0); + } finally { + db.prepare('PRAGMA foreign_keys = ON').run(); + } + // Restore by re-initializing DB for subsequent tests + initDb(':memory:'); + }); + }); + + // ---------- Test Suite 4: isTruncatedResponse ---------- + describe('isTruncatedResponse', () => { + const truncationSamples = [ + 'Response was truncated due to length', + 'Truncation error occurred', + 'This response was truncated', + 'truncation detected', + ]; + + truncationSamples.forEach(sample => { + it(`returns true for string containing '${sample}'`, () => { + expect(isTruncatedResponse(sample)).toBe(true); + }); + }); + + it('returns false for normal error messages', () => { + expect(isTruncatedResponse('Invalid API key')).toBe(false); + }); + + it('returns false for empty strings', () => { + expect(isTruncatedResponse('')).toBe(false); + }); + + it('handles non-string input gracefully', () => { + // isTruncatedResponse converts to string via String(), so objects become "[object Object]" + expect(isTruncatedResponse({ message: 'truncated' })).toBe(false); + expect(isTruncatedResponse(null)).toBe(false); + expect(isTruncatedResponse(undefined)).toBe(false); + }); + }); + + // ---------- Integration Tests ---------- + describe('Integration: ban lifecycle', () => { + it('ban persists across model changes and expires after TTL', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + setStickyModel(messages, longcatRow.id, 'balanced'); + // Ban LongCat for this session + banPlatformFromSession(messages, 'balanced', 'longcat'); + // getStickyModel still returns the model (ban check is in routing logic, not getStickyModel) + expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); + // But isSessionBannedFromPlatform should 
return true + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + // Simulate TTL expiration by adjusting lastUsed + const entry = stickySessionMap.get(key); + entry.lastUsed = Date.now() - (31 * 60 * 1000); // 31 minutes + // After expiration, ban should be considered cleared + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('ban check and skipModels work together to prevent banned platform selection', () => { + const messages = makeMessages('Hello'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + // Set sticky model to a LongCat model + setStickyModel(messages, longcatRow.id, 'balanced'); + // Verify sticky model is set + expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); + // Ban LongCat for this session + banPlatformFromSession(messages, 'balanced', 'longcat'); + // Verify ban is registered + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + // Verify addLongcatModelsToSkipModels includes the banned model + const skipModels = new Set(); + addLongcatModelsToSkipModels(skipModels); + expect(skipModels.has(longcatRow.id)).toBe(true); + }); + }); +}); diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 59c837d6..245b7c0e 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -135,6 +135,20 @@ function isTruncatedResponse(errOrContent: any): boolean { || str.includes('truncation') } +// Exported for testing purposes only +export { + isSessionBannedFromPlatform, + banPlatformFromSession, + addLongcatModelsToSkipModels, + isTruncatedResponse, + getSessionKey, + getStickyModel, + getStickyKey, + setStickyModel, + clearStickyModel, + stickySessionMap, +}; + function clearStickyModel(messages: ChatMessage[], routingMode: RoutingMode) { const key = getSessionKey(messages, routingMode); if (!key) return; From 
7f5e0b2c0ed99f9e150b736d6c00a173d2f54187 Mon Sep 17 00:00:00 2001 From: vi Date: Tue, 2 Jun 2026 00:05:35 +0300 Subject: [PATCH 4/8] feat(proxy): generalize provider session ban to all providers via 5xx consecutive failures - Extend stickySessionMap with consecutiveFailures tracking per provider - Add recordConsecutiveFailure(), resetConsecutiveFailures(), resetAllConsecutiveFailures() - Replace addLongcatModelsToSkipModels with generic addProviderModelsToSkipModels - Replace LongCat-specific auth/rate-limit ban with general 5xx consecutive failure detection (threshold: 2) - Generalize truncation detection to all providers (post-stream + mid-stream) - Update getStickyKey() to check bannedPlatforms for any platform - Update pre-routing ban check to be generic (any banned platform) - Add success path counter reset on both streaming and non-streaming paths - Remove LongCat-specific auth error ban, rate limit ban, and addLongcatModelsToSkipModels - Rename and rewrite tests from longcat-session-ban to provider-session-ban (32 test cases) - TypeScript compiles cleanly, all 150 tests pass --- .roo/specs/longcat-session-ban/design.md | 18 +- .../specs/longcat-session-ban/requirements.md | 2 +- .roo/specs/provider-5xx-session-ban/design.md | 540 ++++++++++++++++++ .../provider-5xx-session-ban/requirements.md | 60 ++ .roo/specs/provider-5xx-session-ban/tasks.md | 130 +++++ .../routes/longcat-session-ban.test.ts | 265 --------- .../routes/provider-session-ban.test.ts | 451 +++++++++++++++ server/src/routes/proxy.ts | 213 ++++--- 8 files changed, 1335 insertions(+), 344 deletions(-) create mode 100644 .roo/specs/provider-5xx-session-ban/design.md create mode 100644 .roo/specs/provider-5xx-session-ban/requirements.md create mode 100644 .roo/specs/provider-5xx-session-ban/tasks.md delete mode 100644 server/src/__tests__/routes/longcat-session-ban.test.ts create mode 100644 server/src/__tests__/routes/provider-session-ban.test.ts diff --git 
a/.roo/specs/longcat-session-ban/design.md b/.roo/specs/longcat-session-ban/design.md index f986b7a2..26acc4dc 100644 --- a/.roo/specs/longcat-session-ban/design.md +++ b/.roo/specs/longcat-session-ban/design.md @@ -57,7 +57,7 @@ Extended to: { modelDbId: number; keyId?: number; bannedPlatforms?: Set; lastUsed: number } ``` -The `bannedPlatforms` field is optional for backward compatibility. Existing entries without it default to anundefined` (no bans). Non-LongCat sessions never have bans. +The `bannedPlatforms` field is optional for backward compatibility. Existing entries without it default to `undefined` (no bans). Non-LongCat sessions never have bans. ## New Functions @@ -384,10 +384,10 @@ No change needed — the existing boost logic already checks `hasCapacity` by qu ```mermaid flowchart TD ERR[Provider Error] --> TYPE{Error Type?} - TYPE --> |Auth 401/403?` --> BAN1[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] - TYPE --> `Rate Limit 429?` --> BAN2[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] - TYPE --> `Truncated Response?` --> BAN3[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] - TYPE --> `Other Error?` --> EXIST[Existing Retry Behavior] + TYPE -->|Auth 401/403?| BAN1[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Rate Limit 429?| BAN2[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Truncated Response?| BAN3[Ban LongCat + Skip LongCat Models + Clear preferredKeyId] + TYPE -->|Other Error?| EXIST[Existing Retry Behavior] BAN1 --> RETRY[Continue Retry Loop - Fall Back to Next Model] BAN2 --> RETRY @@ -395,11 +395,11 @@ flowchart TD subgraph Stream Detection STREAM[Stream Comple] --> CHECK{Check streamedText for Truncation} - CHECK -->|Truncated|` BAN4[Ban LongCat + Record for Future Requests] - CHECK -->|Not Truncated|` SUCCESS[Normal Success Path] + CHECK -->|Truncated| BAN4[Ban LongCat + Record for Future Requests] + CHECK -->|Not Truncated| SUCCESS[Normal Success Path] 
MID[Mid-Stream Error] --> CHECK2{Check Error Message for Truncation} - CHECK2 -->|Truncated|` BAN5[Ban LongCat + End Stream Gracefully + Return] - CHECK2 -->|Not Truncated|` ERRSEND[Send Error SSE Event + Return] + CHECK2 -->|Truncated| BAN5[Ban LongCat + End Stream Gracefully + Return] + CHECK2 -->|Not Truncated| ERRSEND[Send Error SSE Event + Return] end ``` diff --git a/.roo/specs/longcat-session-ban/requirements.md b/.roo/specs/longcat-session-ban/requirements.md index f5ceee36..bd1d065c 100644 --- a/.roo/specs/longcat-session-ban/requirements.md +++ b/.roo/specs/longcat-session-ban/requirements.md @@ -12,7 +12,7 @@ When a LongCat sticky session encounters an error — whether it's an auth failu and eviction. - The existing LongCat sticky key feature ([`longcat-sticky-key` spec](../../roo/specs/longcat-sticky-key/)) extends this to also prefer using the **same API key** within a session. For LongCat specifically, because LongCat benefits from session continuity at the key level. same key = same session context on their server side). The current behavior on auth errors (401/403) is to [`clearStickyKey()`](server/src/routes/proxy.ts:89-98) — which clears the sticky key but **keep the sticky model pinned to LongCat** via [`preferredModel`](server/src/routes/proxy.ts:1036-1037). On retry, [`routeRequest()`](server/src/services/router.ts:458) still has `preferredModel` pointing to LongCat, and tries **another LongCat key** via round-robin. and **LongCat detects different keys usage for the same session** → the "multiple API keys" problem. The [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) function explicitly **does NOT skip the model** for auth errors or rate limit errors — so auth failures on LongCat result in key rotation within the same LongCat model, which is exactly what LongCat detects as "multiple API keys use." for the same session. 
Similarly, when LongCat returns a "truncated" or "conflict" error ( the provider truncates the response mid-stream, the current behavior is to silently switch to a different key and but the session continues on LongCat with a different key — same problem. The "truncated" error pattern is also detected by [`isRetryableError()`](server/src/routes/proxy.ts:409-4428) which checking for 429, 413, 400, 404, 408, 409, 422, 500, 502, 503, 504, andrate limit`, `quota`, `aborted`, `timeout`, `econnrefused`, `econnreset`, `unauthorized`, `forbidden`, `invalid api key`, `no longer available`, `model not found`, `bad request`, `invalid json payload`. TheisRetryableError()` function returns true for all these cases, meaning the proxy will retry with a different model/keykey. However, [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) returns `true` only for rate-limit and auth errors — it does NOT skip the model. This means auth errors and LongCat result in key rotation within the same model, which is exactly the behavior LongCat detects as "multiple API keys use" for the same session. The existing [`clearStickyKey()`](server/src/routes/proxy.ts:89-98) only clears the sticky key but **keoes `preferredKeyId` to `undefined` — but the sticky model remains pinned to LongCat. On the next retry, [`routeRequest()`](server/src/services/router.ts:458) still receives `preferredModel` pointing to LongCat, and tries another LongCat key via round-robin. The LongCat smart-auto preference in [`router.ts`](server/src/services/router.ts:498-527) also means LongCat is still tried first in smart mode, so the retry will likely hit LongCat again. ## Functional Requirements ### FR-1: Detect Multiple Key Use on LongCat When a LongCat provider returns an error indicating that the same API key is been used for the same session ( the system must detect this condition. the error response contains language signaling multiple key use. a single session. 
Detection patterns: - Auth errors (401/403) — the current behavior already clears the sticky key but but tries another key on the same model - Rate-limit errors (429) — same pattern: key rotation within the same model - "Truncated" / "conflict" errors — LongCat truncates responses mid-stream when the response is shorter than expected, indicating the provider cut off the session. Detection keywords: "truncated", "truncation", "conflict", "length", "maximum length", "context_length_exceeded", "token_limit" - Any error message that the provider is complaining about session length or capacity limits for the current conversation. ### FR-2: Ban LongCat Platform for Sticky Session When FR-1 is triggered, the system must ban the entire `longcat` platform for the current sticky session. This means: - All LongCat model IDs must be added to `skipModels` in the retry loop - The sticky session must be updated to point to the new fallback model instead of LongCat - The session must never be routed to LongCat again for until the session expires via TTL (30 min) or The ban must persist across multiple retry attempts within the same request. ### FR-3: Fallback to Next Best Non-LongCat Model After banning LongCat, the retry loop must fall through to the next best available model via the existing Thompson Sampling / smart routing logic. The new model should be selected based on the normal scoring algorithm ( success rate + speed + TTFB + intelligence for smart mode, success rate + speed in balanced mode). ### FR-4: Update Sticky Session to New Model On successful fallback, the sticky session must be updated to point to the new fallback model and `modelDbId` + `keyId`. The sticky key feature should be cleared for the new model since since the fallback is not LongCat, since the sticky key preference only applies to LongCat sessions. 
### FR-5: Never Route Session to LongCat Again Once a session is banned from LongCat, it must never be routed to LongCat again for the remainder of that session's lifetime (30 min TTL). This means: - The `stickySessionMap` entry must include a `bannedPlatforms` field (or `Set`) to track which platforms are banned for this session - On subsequent requests in the same session, `getStickyModel()` returns the preferred model, but the proxy layer must check if the session is banned from LongCat and skip LongCat models before calling `routeRequest()` - The LongCat smart-auto preference in `router.ts` must also be suppressed for banned sessions ( the router should not boost LongCat entries to the front for sessions that are banned from LongCat ### FR-6: Truncated Response Detection When a LongCat streaming response is received, the proxy must check the response content for signs of truncation. If detected, the session must be banned from LongCat immediately, even if the stream has already started (headers sent, the client has already received partial data). 
The system must: - Log the truncation detection - Record the ban in the sticky session - Add all LongCat model IDs to `skipModels` - Continue the retry loop to the next best model - The client receives the truncated partial response plus the new fallback response, which is acceptable behavior for clients like Hermes that handle partial streams gracefully ### FR-7: Auth Error Handling for LongCat Sessions When an auth error (401/403) occurs on a LongCat sticky session: - Clear the sticky key via `clearStickyKey()` (existing behavior) - Additionally ban the LongCat platform for this session via the new `banPlatformFromSession()` function - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success ### FR-8: Rate-Limit Error Handling for LongCat Sessions When a rate-limit error (429) occurs on a LongCat sticky session: - Ban the LongCat platform for this session via `banPlatformFromSession()` - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success - Note: rate-limit errors on LongCat do NOT clear the entire sticky session (the session may still work with a different key on a different model). Only ban LongCat specifically. ### FR-9: Existing Behavior Preserved for Non-LongCat Sessions All existing sticky session behavior for non-LongCat providers must remain unchanged. The new ban mechanism only applies exclusively to LongCat sessions. Non-LongCat sessions that never have platform bans. ### FR-10: Session Expiry Clears Bans When a sticky session expires ( via TTL (30 min), the `bannedPlatforms` set is also cleared. This is natural — expired sessions are evicted from the `stickySessionMap` entirely, including all associated data. 
### FR-11: No Database Schema Changes The ban mechanism is purely in-memory, using the existing `stickySessionMap`. No database schema changes are required. ### FR-12: Minimal Router Changes The router (`server/src/services/router.ts`) should not need significant changes. The only change is that the LongCat smart-auto preference logic should skip sessions that are banned from LongCat. The proxy layer handles all ban detection and session management. The router remains provider-agnostic. ### FR-13: No UI Changes This is a backend-only feature. No client-side changes are needed. ## Non-Functional Requirements ### NFR-1: Backward Compatibility Existing sessions without `bannedPlatforms` (from before this feature or for non-LongCat providers) must continue to work. The `bannedPlatforms` field must be optional in the sticky session map value type. ### NFR-2: Thread Safety The existing `stickySessionMap` is a plain `Map` with no locking ( single-threaded Node.js). The extended map follows the same pattern — no additional concurrency concerns. ### NFR-3: Minimal Performance Impact The ban check adds one `Set` lookup per one optional field check in the sticky session map entry per one DB query to check if a model is on a banned platform. No additional I/O beyond what already exists. ### NFR-4: Test Coverage New unit tests must cover: - Multiple key use detection (auth + rate limit + truncated) - Session ban persistence across retries - Fallback to next best model - Sticky session update on success - Session expiry clearing bans - Non-LongCat sessions unaffected ## Files Requiring Modification | # | File | Change Type | Description | + The existing LongCat sticky key feature ([`longcat-sticky-key` spec](../../roo/specs/longcat-sticky-key/)) extends this to also prefer using the **same API key** within a session. For LongCat specifically, because LongCat benefits from session continuity at the key level. same key = same session context on their server side). 
The current behavior on auth errors (401/403) is to call [`clearStickyKey()`](server/src/routes/proxy.ts:89-98), which clears the sticky key but **keeps the sticky model pinned to LongCat** via [`preferredModel`](server/src/routes/proxy.ts:1036-1037). On retry, [`routeRequest()`](server/src/services/router.ts:458) still has `preferredModel` pointing to LongCat and tries **another LongCat key** via round-robin, so **LongCat detects different keys being used for the same session** → the "multiple API keys" problem. The [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) function explicitly **does NOT skip the model** for auth errors or rate-limit errors — so auth failures on LongCat result in key rotation within the same LongCat model, which is exactly what LongCat detects as "multiple API keys use" for the same session. Similarly, when LongCat returns a "truncated" or "conflict" error (the provider truncates the response mid-stream), the current behavior is to silently switch to a different key, but the session continues on LongCat with a different key — same problem. The "truncated" error pattern is also detected by [`isRetryableError()`](server/src/routes/proxy.ts:409-428), which checks for 429, 413, 400, 404, 408, 409, 422, 500, 502, 503, 504, and `rate limit`, `quota`, `aborted`, `timeout`, `econnrefused`, `econnreset`, `unauthorized`, `forbidden`, `invalid api key`, `no longer available`, `model not found`, `bad request`, `invalid json payload`. The `isRetryableError()` function returns true for all these cases, meaning the proxy will retry with a different model/key. However, [`shouldSkipModelOnRetry()`](server/src/routes/proxy.ts:430-432) does not return `true` for rate-limit and auth errors — it does NOT skip the model. This means auth errors on LongCat result in key rotation within the same model, which is exactly the behavior LongCat detects as "multiple API keys use" for the same session.
The existing [`clearStickyKey()`](server/src/routes/proxy.ts:89-98) only clears the sticky key (after which `preferredKeyId` is set to `undefined`) — but the sticky model remains pinned to LongCat. On the next retry, [`routeRequest()`](server/src/services/router.ts:458) still receives `preferredModel` pointing to LongCat, and tries another LongCat key via round-robin. The LongCat smart-auto preference in [`router.ts`](server/src/services/router.ts:498-527) also means LongCat is still tried first in smart mode, so the retry will likely hit LongCat again. ## Functional Requirements ### FR-1: Detect Multiple Key Use on LongCat When a LongCat provider returns an error indicating that multiple API keys have been used for the same session (i.e. the error response contains language signaling multiple key use within a single session), the system must detect this condition. Detection patterns: - Auth errors (401/403) — the current behavior already clears the sticky key but tries another key on the same model - Rate-limit errors (429) — same pattern: key rotation within the same model - "Truncated" / "conflict" errors — LongCat truncates responses mid-stream when the response is shorter than expected, indicating the provider cut off the session. Detection keywords: "truncated", "truncation", "conflict", "length", "maximum length", "context_length_exceeded", "token_limit" - Any error message indicating that the provider is complaining about session length or capacity limits for the current conversation. ### FR-2: Ban LongCat Platform for Sticky Session When FR-1 is triggered, the system must ban the entire `longcat` platform for the current sticky session. This means: - All LongCat model IDs must be added to `skipModels` in the retry loop - The sticky session must be updated to point to the new fallback model instead of LongCat - The session must never be routed to LongCat again until the session expires via TTL (30 min). The ban must persist across multiple retry attempts within the same request.
### FR-3: Fallback to Next Best Non-LongCat Model After banning LongCat, the retry loop must fall through to the next best available model via the existing Thompson Sampling / smart routing logic. The new model should be selected based on the normal scoring algorithm (success rate + speed + TTFB + intelligence for smart mode, success rate + speed in balanced mode). ### FR-4: Update Sticky Session to New Model On successful fallback, the sticky session must be updated to point to the new fallback model's `modelDbId` + `keyId`. The sticky key feature should be cleared for the new model because the fallback is not LongCat and the sticky key preference only applies to LongCat sessions. ### FR-5: Never Route Session to LongCat Again Once a session is banned from LongCat, it must never be routed to LongCat again for the remainder of that session's lifetime (30 min TTL). This means: - The `stickySessionMap` entry must include a `bannedPlatforms` field (a `Set<string>`) to track which platforms are banned for this session - On subsequent requests in the same session, `getStickyModel()` returns the preferred model, but the proxy layer must check if the session is banned from LongCat and skip LongCat models before calling `routeRequest()` - The LongCat smart-auto preference in `router.ts` must also be suppressed for banned sessions (the router should not boost LongCat entries to the front for sessions that are banned from LongCat) ### FR-6: Truncated Response Detection When a LongCat streaming response is received, the proxy must check the response content for signs of truncation. If detected, the session must be banned from LongCat immediately, even if the stream has already started (headers sent, the client has already received partial data).
The system must: - Log the truncation detection - Record the ban in the sticky session - Add all LongCat model IDs to `skipModels` - End the stream and ban for future requests - The client receives the truncated partial response as-is; future requests in this session will route to non-LongCat models ### FR-7: Auth Error Handling for LongCat Sessions When an auth error (401/403) occurs on a LongCat sticky session: - Clear the sticky key via `clearStickyKey()` (existing behavior) - Additionally ban the LongCat platform for this session via the new `banPlatformFromSession()` function - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success ### FR-8: Rate-Limit Error Handling for LongCat Sessions When a rate-limit error (429) occurs on a LongCat sticky session: - Ban the LongCat platform for this session via `banPlatformFromSession()` - Add all LongCat model IDs to `skipModels` - Set `preferredKeyId` to `undefined` - On retry, fall through to the next best non-LongCat model - Update sticky session to the new model on success - Note: rate-limit errors on LongCat do NOT clear the entire sticky session (the session may still work with a different key on a different model). Only ban LongCat specifically. ### FR-9: Existing Behavior Preserved for Non-LongCat Sessions All existing sticky session behavior for non-LongCat providers must remain unchanged. The new ban mechanism applies exclusively to LongCat sessions. Non-LongCat sessions never have platform bans. ### FR-10: Session Expiry Clears Bans When a sticky session expires via TTL (30 min), the `bannedPlatforms` set is also cleared. This is natural — expired sessions are evicted from the `stickySessionMap` entirely, including all associated data. ### FR-11: No Database Schema Changes The ban mechanism is purely in-memory, using the existing `stickySessionMap`.
No database schema changes are required. ### FR-12: Minimal Router Changes The router (`server/src/services/router.ts`) should not need significant changes. The only change is that the LongCat smart-auto preference logic should skip sessions that are banned from LongCat. The proxy layer handles all ban detection and session management. The router remains provider-agnostic. ### FR-13: No UI Changes This is a backend-only feature. No client-side changes are needed. ## Non-Functional Requirements ### NFR-1: Backward Compatibility Existing sessions without `bannedPlatforms` (from before this feature or for non-LongCat providers) must continue to work. The `bannedPlatforms` field must be optional in the sticky session map value type. ### NFR-2: Thread Safety The existing `stickySessionMap` is a plain `Map` with no locking (single-threaded Node.js). The extended map follows the same pattern — no additional concurrency concerns. ### NFR-3: Minimal Performance Impact The ban check adds one `Set` lookup, one optional field check in the sticky session map entry, and one DB query to check if a model is on a banned platform. No additional I/O beyond what already exists.
### NFR-4: Test Coverage New unit tests must cover: - Multiple key use detection (auth + rate limit + truncated) - Session ban persistence across retries - Fallback to next best model - Sticky session update on success - Session expiry clearing bans - Non-LongCat sessions unaffected ## Files Requiring Modification | # | File | Change Type | Description | |---|---|---|---| | | 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:16) | Edit | Extend `stickySessionMap` value type to include `bannedPlatforms` | | | 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:34-52) | Edit | Add `isSessionBannedFromPlatform()` helper function | diff --git a/.roo/specs/provider-5xx-session-ban/design.md b/.roo/specs/provider-5xx-session-ban/design.md new file mode 100644 index 00000000..889c0252 --- /dev/null +++ b/.roo/specs/provider-5xx-session-ban/design.md @@ -0,0 +1,540 @@ +# Design: Provider 5xx Session Ban + +## Architecture Overview + +The ban mechanism extends the existing sticky session infrastructure in `proxy.ts`. The router (`router.ts`) requires **no changes** — the existing `skipModels` mechanism handles routing around banned providers. All ban detection, consecutive failure tracking, and session management happens in the proxy layer. + +There are **two independent ban triggers** that both use the same `bannedPlatforms` infrastructure: +1. **5xx consecutive failure ban** — 2 consecutive 5xx errors from the same provider +2. **Truncation detection ban** — a truncated response from any provider (200 with incomplete content) + +```mermaid +graph TD + subgraph Proxy [proxy.ts] + SSM[stickySessionMap
key → modelDbId + keyId + bannedPlatforms + consecutiveFailures + lastUsed] + GSF[recordConsecutiveFailure] + RSF[resetConsecutiveFailures] + RSA[resetAllConsecutiveFailures] + BPS[banPlatformFromSession] + GSB[isSessionBannedFromPlatform] + APS[addProviderModelsToSkipModels] + ITR[isTruncatedResponse
retained for all providers] + end + + subgraph RetryLoop [handleChatCompletion retry loop] + DET[5xx Error Detection] + INC[Increment Counter] + CHK{Count >= 2?} + BAN5[Ban Provider via 5xx] + TRUNC[Truncation Detection
any provider] + BANT[Ban Provider via Truncation] + SKIP[Add skipModels] + FALL[Fallback Routing] + SUC[Success] + RST[Reset Counter] + end + + SSM --> GSF + SSM --> RSF + SSM --> RSA + SSM --> ITR + DET --> INC + INC --> CHK + CHK -->|Yes| BAN5 + CHK -->|No| FALL + BAN5 --> SSM + BAN5 --> SKIP + TRUNC --> BANT + BANT --> SSM + BANT --> SKIP + SUC --> RST + RST --> SSM +``` + +## Data Model Changes + +### Sticky Session Map Value Type + +Current value type at [`proxy.ts:16`](../server/src/routes/proxy.ts:16): +```typescript +{ + modelDbId: number; + keyId?: number; + bannedPlatforms?: Set<string>; + lastUsed: number; +} +``` + +Extended to: +```typescript +{ + modelDbId: number; + keyId?: number; + bannedPlatforms?: Set<string>; + consecutiveFailures?: Map<string, number>; // provider → count + lastUsed: number; +} +``` + +The `consecutiveFailures` field is optional for backward compatibility. Existing entries without it default to `undefined` (no tracked failures). The map is keyed by provider name (e.g. `'longcat'`, `'groq'`, `'openrouter'`). + +## New Functions + +### 1. `recordConsecutiveFailure()` — [`proxy.ts`](../server/src/routes/proxy.ts) + +Increments the consecutive failure counter for a provider within a sticky session. If the threshold (2) is reached, bans the provider and adds all its models to `skipModels`. + +```typescript +function recordConsecutiveFailure( + messages: ChatMessage[], + routingMode: RoutingMode, + provider: string, + skipModels: Set<number>, + modelDbId?: number, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + let entry = stickySessionMap.get(key); + if (!entry) { + if (modelDbId === undefined) return; + entry = { modelDbId, lastUsed: Date.now() }; + stickySessionMap.set(key, entry); + } + if (!entry.consecutiveFailures) entry.consecutiveFailures = new Map(); + const current = entry.consecutiveFailures.get(provider) ?? 
0; + const count = current + 1; + entry.consecutiveFailures.set(provider, count); + entry.lastUsed = Date.now(); + console.log(`[Sticky] consecutive 5xx for ${provider}: ${count}/2 session=${key.slice(0, 8)}`); + + if (count >= 2) { + // Ban the provider + if (!entry.bannedPlatforms) entry.bannedPlatforms = new Set(); + entry.bannedPlatforms.add(provider); + console.log(`[Sticky] banned platform=${provider} for session=${key.slice(0, 8)} | consecutive 5xx count=${count}`); + // Add all models of this provider to skipModels + addProviderModelsToSkipModels(skipModels, provider); + // Clear consecutive failures for this provider (ban is now in effect) + entry.consecutiveFailures.delete(provider); + } +} +``` + +### 2. `resetConsecutiveFailures()` — [`proxy.ts`](../server/src/routes/proxy.ts) + +Resets the consecutive failure counter for a specific provider. Called when that provider succeeds. + +```typescript +function resetConsecutiveFailures( + messages: ChatMessage[], + routingMode: RoutingMode, + provider: string, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (!entry.consecutiveFailures) return; + if (entry.consecutiveFailures.has(provider)) { + entry.consecutiveFailures.delete(provider); + console.log(`[Sticky] reset consecutive failures for ${provider} session=${key.slice(0, 8)}`); + } +} +``` + +### 3. `resetAllConsecutiveFailures()` — [`proxy.ts`](../server/src/routes/proxy.ts) + +Resets all consecutive failure counters. Called on any successful response to clear stale counters for other providers. 
+ +```typescript +function resetAllConsecutiveFailures( + messages: ChatMessage[], + routingMode: RoutingMode, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (entry.consecutiveFailures && entry.consecutiveFailures.size > 0) { + entry.consecutiveFailures.clear(); + console.log(`[Sticky] reset all consecutive failures session=${key.slice(0, 8)}`); + } +} +``` + +### 4. `addProviderModelsToSkipModels()` — [`proxy.ts`](../server/src/routes/proxy.ts) + +Generic version of `addLongcatModelsToSkipModels()`. Queries the DB for all enabled models of a given provider and adds them to the `skipModels` set. + +```typescript +function addProviderModelsToSkipModels( + skipModels: Set<number>, + provider: string, +): void { + const db = getDb(); + const models = db.prepare( + 'SELECT id FROM models WHERE platform = ? AND enabled = 1' + ).all(provider) as Array<{ id: number }>; + for (const m of models) { + skipModels.add(m.id); + } + console.log(`[Sticky] added ${models.length} ${provider} model(s) to skipModels: [${models.map(m => m.id).join(',')}]`); +} +``` + +## Component Changes + +### 1. Sticky Session Map Type — [`proxy.ts:16`](../server/src/routes/proxy.ts:16) + +```typescript +const stickySessionMap = new Map<string, { + modelDbId: number; + keyId?: number; + bannedPlatforms?: Set<string>; + consecutiveFailures?: Map<string, number>; + lastUsed: number; +}>(); +``` + +### 2. Exports Update — [`proxy.ts:146-157`](../server/src/routes/proxy.ts:146-157) + +Add new functions to the exported block for testing: + +```typescript +export { + isSessionBannedFromPlatform, + banPlatformFromSession, + addProviderModelsToSkipModels, // renamed from addLongcatModelsToSkipModels + recordConsecutiveFailure, + resetConsecutiveFailures, + resetAllConsecutiveFailures, + isTruncatedResponse, // retained, generalized to all providers + getSessionKey, + getStickyModel, + getStickyKey, + setStickyModel, + clearStickyModel, + stickySessionMap, +}; +``` + +### 3. 
Pre-routing Ban Check — [`proxy.ts:1138-1152`](../server/src/routes/proxy.ts:1138-1152) + +Generalize from LongCat-only to any banned platform. Instead of hardcoding `'longcat'`, check the platform of the `preferredModel` dynamically: + +```typescript +// Check if session is banned from the preferred model's platform +const skipModels = new Set<number>(); +if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow && isSessionBannedFromPlatform(normalizedMessages, routingMode, prefRow.platform)) { + addProviderModelsToSkipModels(skipModels, prefRow.platform); + console.log(`[Sticky] skipping preferredModel=${preferredModel} (${prefRow.platform} banned for session)`); + preferredModel = undefined; + preferredKeyId = undefined; + } +} +``` + +### 4. Error Handling in Retry Loop — [`proxy.ts:1378-1425`](../server/src/routes/proxy.ts:1378-1425) + +Replace the LongCat-specific ban logic (lines 1383-1402) with general 5xx consecutive failure detection: + +```typescript +} catch (err: any) { + const latency = Date.now() - start; + logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, 0, latency, null, err.message); + + // General 5xx consecutive failure detection (replaces LongCat-specific ban logic) + const errStatus = getErrorStatus(err); + if (errStatus && errStatus >= 500 && errStatus < 600) { + recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); + // If this provider was just banned, clear preferredModel/preferredKeyId if they point to it + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === route.platform) { + preferredModel = undefined; + preferredKeyId = undefined; + } + } + } + + if (isRetryableError(err)) { + const skipId = 
`${route.platform}:${route.modelId}:${route.keyId}`; + skipKeys.add(skipId); + if (shouldSkipModelOnRetry(err)) { + skipModels.add(route.modelDbId); + } + if (isRateLimitError(err)) { + setCooldown(route.platform, route.modelId, route.keyId, 120_000); + } + // Auth errors (401/403): clear the sticky key for this session + if (isAuthError(err)) { + console.warn(`[Proxy] auth error ${errStatus} from ${route.displayName}/${route.modelId}, clearing sticky key for session`); + clearStickyKey(normalizedMessages, routingMode); + preferredKeyId = undefined; + } + lastError = err; + console.warn(`[Proxy] retryable ${summarizeProviderError(err)} from ${route.displayName}/${route.modelId}, fallback (attempt ${attempt + 1}/${MAX_RETRIES})`); + continue; + } + + // Non-retryable error + clearStickyModel(normalizedMessages, routingMode); + res.status(502).json({ ... }); + return; +} +``` + +### 5. Success Path Counter Reset — [`proxy.ts:1289-1293`](../server/src/routes/proxy.ts:1289-1293) and [`proxy.ts:1360-1362`](../server/src/routes/proxy.ts:1360-1362) + +After a successful response (both streaming and non-streaming), reset consecutive failure counters: + +```typescript +// Streaming success path (after line 1291, before logRequest) +recordTokens(route.platform, route.modelId, route.keyId, estimatedInputTokens + totalOutputTokens); +recordSuccess(route.modelDbId); +setStickyModel(normalizedMessages, route.modelDbId, routingMode, route.keyId); +resetAllConsecutiveFailures(normalizedMessages, routingMode); // NEW +logRequest(route.platform, route.modelId, 'success', ...); +return; + +// Non-streaming success path (after line 1361, before res.json) +recordTokens(route.platform, route.modelId, route.keyId, totalTokens); +recordSuccess(route.modelDbId); +setStickyModel(normalizedMessages, route.modelDbId, routingMode, route.keyId); +resetAllConsecutiveFailures(normalizedMessages, routingMode); // NEW +res.json(responseBody); +``` + +### 6. 
Mid-Stream Error Handling — [`proxy.ts:1294-1346`](../server/src/routes/proxy.ts:1294-1346) + +Replace the LongCat-specific mid-stream truncation handling with generalized truncation detection for all providers, plus 5xx consecutive failure detection: + +```typescript +} catch (streamErr: any) { + if (streamStarted) { + // General 5xx consecutive failure detection for mid-stream errors + const streamErrStatus = getErrorStatus(streamErr); + if (streamErrStatus && streamErrStatus >= 500 && streamErrStatus < 600) { + recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); + } + + // Generalized truncation detection for any provider (not just LongCat) + // Check if the stream was truncated mid-stream (e.g., incomplete content before error) + const streamTextToCheck = responseStreamContext ? responseStreamContext.outputText : streamedText; + if (isTruncatedResponse(streamTextToCheck)) { + console.warn(`[Proxy] Truncated stream content detected from ${route.platform} — banning ${route.platform} for session`); + banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); + } + + // Existing mid-stream error handling (send error SSE event) + console.error(`[Proxy] Mid-stream error from ${route.displayName}:`, streamErr.message); + const payload = { error: { message: `Provider error (${route.displayName}): stream interrupted`, type: 'stream_error' } }; + try { + if (responseStreamContext) { + writeResponseStreamEvent(res, { + type: 'response.failed', + response: { + id: responseStreamContext.responseId, + status: 'failed', + error: payload.error, + }, + }); + } else { + res.write(`data: ${JSON.stringify(payload)}\n\n`); + res.write('data: [DONE]\n\n'); + } + res.end(); + } catch { /* socket gone */ } + logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, totalOutputTokens, Date.now() - start, ttfbMs, streamErr.message); + return; + } + // Pre-stream error — bubble to outer 
retry/502 handler. + throw streamErr; +} +``` + +### 7. Generalize Post-Stream Truncation Detection — [`proxy.ts:1236-1242`](../server/src/routes/proxy.ts:1236-1242) + +Generalize the post-stream truncation check from LongCat-only to any provider: + +```typescript +// BEFORE (LongCat-only): +// if (route.platform === 'longcat') { +// const streamTextToCheck = responseStreamContext ? responseStreamContext.outputText : streamedText; +// if (isTruncatedResponse(streamTextToCheck)) { +// console.warn(`[Proxy] LongCat truncated stream content detected — banning longcat for session`); +// banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); +// } +// } + +// AFTER (any provider): +const streamTextToCheck = responseStreamContext ? responseStreamContext.outputText : streamedText; +if (isTruncatedResponse(streamTextToCheck)) { + console.warn(`[Proxy] Truncated stream content detected from ${route.platform} — banning ${route.platform} for session`); + banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); +} +``` + +### 8. Retain `isTruncatedResponse()` Function — [`proxy.ts:128-143`](../server/src/routes/proxy.ts:128-143) + +The `isTruncatedResponse()` function is **retained** and used for all providers. No changes needed to the function itself — it checks response content for truncation patterns regardless of provider. + +### 9. Remove `addLongcatModelsToSkipModels()` Function — [`proxy.ts:117-126`](../server/src/routes/proxy.ts:117-126) + +Replace with the generic `addProviderModelsToSkipModels()`. 
+ +## Consecutive Failure Tracking Flow + +### 5xx Error Path + +```mermaid +flowchart TD + REQ[Request to Provider A] --> ERR{5xx Error?} + ERR -->|No| SUCCESS[Success → Reset All Counters] + SUCCESS --> NEXT[Next Request] + ERR -->|Yes| INC[Increment Counter for Provider A] + INC --> LOG1[Log: consecutive 5xx count] + LOG1 --> THRESH{Count >= 2?} + THRESH -->|No| RETRY[Continue Retry Loop] + RETRY --> NEXT + THRESH -->|Yes| BAN[Ban Provider A] + BAN --> SKIP[Add All Provider A Models to skipModels] + BAN --> CLEAR[Clear preferredModel/preferredKeyId if Provider A] + BAN --> LOG2[Log: Provider A banned for session] + CLEAR --> RETRY2[Retry with Next Best Model] + RETRY2 --> NEXT +``` + +### Success Path + +```mermaid +flowchart TD + REQ[Request to Provider A] --> SUC{Success?} + SUC -->|Yes| RESET[Reset All Consecutive Failure Counters] + RESET --> STICKY[Update Sticky Model to Provider A] + STICKY --> LOG[Log: counters reset] + SUC -->|No| ERR{5xx Error?} + ERR -->|Yes| INC[Increment Counter for Provider A] + INC --> RETRY{Count >= 2?} + RETRY -->|No| FALLBACK[Retry Next Model] + RETRY -->|Yes| BAN[Ban Provider A → Fallback] +``` + +### Truncation Detection Path + +```mermaid +flowchart TD + REQ[Request to Any Provider] --> COMPLETE{Stream/Response Complete?} + COMPLETE -->|Yes| CHECK[Check isTruncatedResponse] + CHECK --> TRUNC{Truncated?} + TRUNC -->|No| DONE[Continue Normally] + TRUNC -->|Yes| BAN[Ban Provider via banPlatformFromSession] + BAN --> SKIP[Add All Provider Models to skipModels] + BAN --> LOG[Log: Provider banned for truncation] + SKIP --> FALLBACK[Fallback to Next Provider] + COMPLETE -->|Mid-Stream Error| MCHECK[Check isTruncatedResponse on partial content] + MCHECK --> MTRUNC{Truncated?} + MTRUNC -->|Yes| MBAN[Ban Provider via banPlatformFromSession] + MTRUNC -->|No| MERR[Handle as Normal Error] + MBAN --> MLOG[Log: Provider banned for truncation mid-stream] +``` + +## Session Lifecycle + +```mermaid +sequenceDiagram + participant Client + 
participant Proxy + participant Router + participant ProviderA + participant ProviderB + + Note over Client,Proxy: Request 1 - auto-smart routing + Client->>Proxy: POST /v1/chat/completions + Proxy->>Router: routeRequest - preferredModel=undefined - skipModels=empty + Router->>Proxy: RouteResult - ProviderA - keyId=1 + Proxy->>ProviderA: POST /chat/completions - key1 + ProviderA->>Proxy: 200 - response + Proxy->>Client: 200 - response + Proxy->>Proxy: setStickyModel - modelDbId=PA - keyId=1 + Proxy->>Proxy: resetAllConsecutiveFailures + + Note over Client,Proxy: Request 2 - same session - ProviderA returns 503 + Client->>Proxy: POST /v1/chat/completions + Proxy->>Proxy: getStickyModel - modelDbId=PA + Proxy->>Proxy: isSessionBannedFromPlatform - ProviderA? NO + Proxy->>Router: routeRequest - preferredModel=PA - preferredKeyId=1 + Router->>Proxy: RouteResult - ProviderA - keyId=1 + Proxy->>ProviderA: POST /chat/completions - key1 + ProviderA->>Proxy: 503 - Service Unavailable + Proxy->>Proxy: recordConsecutiveFailure - ProviderA count=1 + Proxy->>Router: routeRequest - preferredModel=PA - skipModels=empty + Router->>Proxy: RouteResult - ProviderA - keyId=2 + Proxy->>ProviderA: POST /chat/completions - key2 + ProviderA->>Proxy: 502 - Bad Gateway + Proxy->>Proxy: recordConsecutiveFailure - ProviderA count=2 → BAN + Proxy->>Proxy: addProviderModelsToSkipModels - ProviderA + Proxy->>Proxy: preferredModel=undefined - preferredKeyId=undefined + Proxy->>Router: routeRequest - skipModels=ProviderA models + Router->>Proxy: RouteResult - ProviderB - keyId=3 + ProviderB->>Proxy: 200 - response + Proxy->>Client: 200 - response + Proxy->>Proxy: setStickyModel - modelDbId=PB - keyId=3 + Proxy->>Proxy: resetAllConsecutiveFailures + + Note over Client,Proxy: Request 3 - same session - ProviderA still banned + Client->>Proxy: POST /v1/chat/completions + Proxy->>Proxy: getStickyModel - modelDbId=PB + Proxy->>Proxy: isSessionBannedFromPlatform - ProviderA? 
YES + Proxy->>Proxy: addProviderModelsToSkipModels - ProviderA + Proxy->>Router: routeRequest - preferredModel=PB - skipModels=ProviderA models + Router->>Proxy: RouteResult - ProviderB - keyId=3 + ProviderB->>Proxy: 200 - response + Proxy->>Client: 200 - response +``` + +## Edge Cases + +### EC-1: Provider Has Only One Model +When a provider has only one model and it gets banned, `addProviderModelsToSkipModels()` adds that single model to `skipModels`. The retry loop continues to the next provider. No special handling needed. + +### EC-2: Provider Has Multiple Models +When a provider has multiple models (e.g., `longcat-2.0-preview` and `longcat-3.0`), `addProviderModelsToSkipModels()` adds ALL enabled model IDs for that provider to `skipModels`. This ensures the session is banned from ALL models of that provider, not just the one that failed. + +### EC-3: All Providers Banned +If all providers become banned for a session, the retry loop exhausts all options. The `routeRequest()` call throws when no models are available, and the existing error handling returns a 502/429 to the client with "All fallback attempts failed". The sticky session entry remains but all providers are banned. When the session TTL expires (30 min), the entry is evicted and the session starts fresh. + +### EC-4: Session Expiry Clears Everything +When a sticky session expires via TTL (30 min), the entire entry is deleted from `stickySessionMap`, including `bannedPlatforms` and `consecutiveFailures`. This is natural — expired sessions are evicted entirely. + +### EC-5: Non-Sticky Sessions +For non-sticky sessions (no first user message, or routing mode that doesn't produce a session key), no consecutive failure tracking or ban logic applies. The existing retry loop behavior is unchanged. + +### EC-6: Concurrent Requests in Same Session +If two concurrent requests in the same session both receive 5xx errors from the same provider, the counter may increment to 2 and trigger a ban. 
This is correct behavior — the provider is clearly having issues. The `stickySessionMap` is a standard JavaScript `Map`, and Node.js is single-threaded, so there are no race conditions. + +### EC-7: Counter Reset on Provider Change +If Provider A has 1 consecutive failure and the retry loop routes to Provider B which succeeds, the counter for Provider A is reset to 0 (via `resetAllConsecutiveFailures()`). This is intentional — the session is now working on Provider B, and Provider A's previous failure may have been transient. + +### EC-8: 5xx Followed by Non-5xx Error +If a provider returns a 5xx error (counter increments to 1) followed by a non-retryable 4xx error, the existing non-retryable error handling clears the entire sticky model via `clearStickyModel()`. This removes the consecutive failure counter as well since the entire entry is deleted. + +### EC-9: Mid-Stream 5xx After Pre-Stream 5xx +If the first attempt gets a pre-stream 5xx error (counter=1) and the retry's stream gets a mid-stream 5xx error (counter=2), the provider is banned. The mid-stream error path calls `recordConsecutiveFailure()` which increments and triggers the ban. The stream is ended with an error SSE event (existing behavior), and subsequent requests in the session will skip this provider. + +### EC-10: Truncation from Any Provider Triggers Ban +If any provider (not just LongCat) returns a truncated response (e.g., a 200 with incomplete content that matches truncation patterns), the `isTruncatedResponse()` check triggers a ban via `banPlatformFromSession()`. This works for post-stream completion checks and mid-stream error checks. The truncation ban is independent of the 5xx consecutive failure ban — a single truncation is enough to ban, whereas 5xx errors require 2 consecutive failures. + +### EC-11: Truncation and 5xx Ban Are Independent +A provider can be banned via either mechanism independently. 
For example, if Provider A has 1 consecutive 5xx failure and then returns a truncated response, it gets banned via truncation (not 5xx). Conversely, if Provider A is banned via truncation and later returns 5xx errors, the 5xx counter still increments (though the provider is already banned, so the counter has no additional effect until the session expires). + +## Files to Modify + +| File | Change | +|---|---| +| `server/src/routes/proxy.ts` | Main implementation — add new functions, update retry loop, generalize LongCat-specific truncation detection to all providers, remove LongCat-specific auth/rate-limit ban logic | +| `server/src/__tests__/routes/longcat-session-ban.test.ts` | Rename to `provider-session-ban.test.ts`, update tests to cover general provider ban and truncation detection for any provider | diff --git a/.roo/specs/provider-5xx-session-ban/requirements.md b/.roo/specs/provider-5xx-session-ban/requirements.md new file mode 100644 index 00000000..e04d9abc --- /dev/null +++ b/.roo/specs/provider-5xx-session-ban/requirements.md @@ -0,0 +1,60 @@ +# Requirements: Provider 5xx Session Ban + +## Overview + +This spec generalizes **two** existing LongCat mechanisms to work for all providers: + +1. **5xx consecutive failure ban** (new, general): When a sticky session receives **2 consecutive 5xx errors (500, 502, 503, 504)** from the same provider, ban that provider for the session. The session falls back to the next best model via normal routing. The ban lasts for the session TTL (30 minutes). +2. **Truncation detection ban** (existing, now generalized): When a truncated response is detected from **any** provider (not just LongCat), ban that provider for the session using the same `banPlatformFromSession()` mechanism. A truncated response can come back as a 200 with incomplete content — this is independent of 5xx errors. + +These are **two independent triggers** that both use the same underlying `bannedPlatforms` infrastructure. 
A provider can be banned either for 2 consecutive 5xx errors OR for a truncated response. Both mechanisms work for ALL providers. + +The LongCat-specific auth error and rate limit ban logic is removed and replaced by the general 5xx consecutive failure mechanism. + +## Context + +The existing sticky sessions feature lives in [`server/src/routes/proxy.ts`](../server/src/routes/proxy.ts:16). It uses an SHA-1 hash of `routingMode + firstUserMessage` to identify sessions, and stores `{ modelDbId, keyId?, bannedPlatforms?, lastUsed }` with a 30-min TTL and 500-entry max. + +The existing LongCat session ban ([`longcat-session-ban` spec](../longcat-session-ban/)) added `bannedPlatforms`, `banPlatformFromSession()`, `isSessionBannedFromPlatform()`, `addLongcatModelsToSkipModels()`, and `isTruncatedResponse()`. This spec generalizes that infrastructure: the `bannedPlatforms` set and ban helper functions are reused, the `isTruncatedResponse()` function is retained and generalized to all providers, and the LongCat-specific auth/rate-limit error detection is replaced by general 5xx consecutive failure tracking. + +The retry loop in `handleChatCompletion()` currently has LongCat-specific error handling at lines 1383-1402 that bans LongCat on auth errors and rate-limit errors. This is replaced by general 5xx consecutive failure detection that works for any provider. The truncation detection is retained but generalized from LongCat-only to all providers. + +## Functional Requirements + +| ID | Requirement | Priority | +|---|---|---| +| FR-1 | Detect 5xx errors (500, 502, 503, 504) from any provider during the retry loop. Detection uses the existing `getErrorStatus()` helper to check the HTTP status code. | Must | +| FR-2 | Track consecutive 5xx failures per provider within a sticky session. The counter is stored in a new `consecutiveFailures: Map` field in the sticky session entry, keyed by provider name. The counter resets on success or when a different provider is used. 
| Must | +| FR-3 | Ban the provider after **2 consecutive** 5xx failures within the same session. The ban is recorded by adding the provider to the existing `bannedPlatforms` set. | Must | +| FR-4 | On ban, add all models of that provider to `skipModels` for the current retry loop. This uses a new generic `addProviderModelsToSkipModels()` function that queries the DB for all enabled models of the given provider. | Must | +| FR-5 | On ban, clear `preferredModel` and `preferredKeyId` if they point to the banned provider, so the router picks the next best model via normal routing. | Must | +| FR-6 | Ban persists for the session TTL (30 minutes, same as `STICKY_TTL_MS`). No separate TTL — the existing sticky session expiry clears everything including bans and consecutive failure counters. | Must | +| FR-7 | Ban is stored in the existing `bannedPlatforms` Set in `stickySessionMap` (reuses existing infrastructure). The `banPlatformFromSession()` and `isSessionBannedFromPlatform()` functions are reused as-is. | Must | +| FR-8 | Mid-stream 5xx errors also count toward the consecutive failure counter. When a mid-stream error has a 5xx status, it increments the counter and triggers a ban if the threshold is reached. The stream is still ended gracefully (existing behavior). | Must | +| FR-9 | The consecutive counter resets when a different provider succeeds. When a successful response comes from provider B, the consecutive failure counter for provider A is reset to 0. | Must | +| FR-10 | The consecutive counter resets when the same provider succeeds. When a successful response comes from provider A, the consecutive failure counter for provider A is reset to 0. | Must | +| FR-11 | Remove the LongCat-specific **auth error** ban logic (lines 1384-1389) and **rate-limit error** ban logic (lines 1390-1395). The general 5xx consecutive failure mechanism supersedes these. Truncation detection is NOT removed — it is generalized to all providers (see FR-14, FR-15). 
| Must | +| FR-12 | Logging — log when a provider is banned with the failure count and session key. Log when consecutive failure counter is incremented. Log when counter is reset on success. Log when a provider is banned due to truncation. | Should | +| FR-13 | The ban check in pre-routing (lines 1141-1152) should check for any banned platform, not just LongCat. The existing `isSessionBannedFromPlatform()` call currently hardcodes `'longcat'` — it should check the platform of the `preferredModel` dynamically. | Must | +| FR-14 | Truncation detection applies to **ALL** providers, not just LongCat. When a truncated response is detected from any provider (after stream completes or mid-stream), ban that provider for the session using the same `banPlatformFromSession()` mechanism. The post-stream truncation check (lines 1236-1242) and mid-stream truncation handling (lines 1297-1318) are generalized from `route.platform === 'longcat'` to any platform. | Must | +| FR-15 | The `isTruncatedResponse()` function is retained and generalized — it checks response content/error messages from any provider for truncation patterns. It is NOT removed. | Must | + +## Non-Functional Requirements + +| ID | Requirement | +|---|---| +| NFR-1 | No changes to the router (`router.ts`). The existing `skipModels` mechanism handles routing around banned providers. | +| NFR-2 | No changes to the database schema. All state is in-memory in `stickySessionMap`. | +| NFR-3 | Backward compatible: existing sticky session entries without `consecutiveFailures` field default to no tracked failures. | +| NFR-4 | Non-sticky sessions are unaffected. The consecutive failure tracking only applies when a sticky session exists. 
| + +## Out of Scope + +- Persistent bans across server restarts (in-memory only, same as existing sticky sessions) +- Configurable threshold for 5xx ban (hardcoded to 2 consecutive failures) +- Configurable threshold for truncation ban (any single truncation triggers ban) +- Bans for non-5xx errors (4xx client errors do not trigger bans) +- Changes to the Thompson Sampling algorithm +- Changes to rate limiting logic +- Client-side UI changes +- Configuration UI for enabling/disabling bans per provider diff --git a/.roo/specs/provider-5xx-session-ban/tasks.md b/.roo/specs/provider-5xx-session-ban/tasks.md new file mode 100644 index 00000000..f01ee8ab --- /dev/null +++ b/.roo/specs/provider-5xx-session-ban/tasks.md @@ -0,0 +1,130 @@ +# Tasks: Provider 5xx Session Ban + +## Implementation Steps + +- [x] 1. Extend `stickySessionMap` value type in `proxy.ts` + - Edit line 16: add `consecutiveFailures?: Map` to the map value type + - This is the foundational type change — all other changes depend on it + +- [x] 2. Add `recordConsecutiveFailure()` function in `proxy.ts` + - Add after `banPlatformFromSession()` (after line 115) + - Parameters: `messages`, `routingMode`, `provider`, `skipModels`, `modelDbId?` + - Creates sticky entry if needed (when `modelDbId` is provided) + - Increments `consecutiveFailures` counter for the provider + - Logs the current count (e.g., `consecutive 5xx for ${provider}: ${count}/2`) + - If count >= 2: calls `banPlatformFromSession()` logic inline (adds to `bannedPlatforms`, calls `addProviderModelsToSkipModels()`, deletes the consecutive failure entry) + - Refreshes `lastUsed` TTL + +- [x] 3. Add `resetConsecutiveFailures()` function in `proxy.ts` + - Add after `recordConsecutiveFailure()` + - Parameters: `messages`, `routingMode`, `provider` + - Deletes the provider's entry from `consecutiveFailures` map if it exists + - Logs the reset + +- [x] 4. 
Add `resetAllConsecutiveFailures()` function in `proxy.ts` + - Add after `resetConsecutiveFailures()` + - Parameters: `messages`, `routingMode` + - Clears the entire `consecutiveFailures` map if it has entries + - Logs the reset + +- [x] 5. Add `addProviderModelsToSkipModels()` function in `proxy.ts` + - Replace the existing `addLongcatModelsToSkipModels()` function (lines 117-126) + - Parameters: `skipModels: Set`, `provider: string` + - Queries DB: `SELECT id FROM models WHERE platform = ? AND enabled = 1` with the provider parameter + - Adds each model ID to `skipModels` + - Logs count and IDs + +- [x] 6. Update `getStickyKey()` to check `bannedPlatforms` for any platform + - In `getStickyKey()` (lines 54-79), after the TTL check, add: + - Look up the sticky model's platform via DB query: `SELECT platform FROM models WHERE id = ?` + - If the model's platform is in `entry.bannedPlatforms`, return `undefined` + - Log: `[Sticky] key skipped session=... | model platform=... is banned` + - This generalizes the existing LongCat-specific check that was in the pre-routing section + +- [x] 7. Update pre-routing ban check in `handleChatCompletion()` + - Replace the LongCat-specific ban check (lines 1138-1152) with a generic version: + - Instead of hardcoding `'longcat'`, look up the `preferredModel`'s platform dynamically + - If `preferredModel` exists, query its platform from the DB + - If the session is banned from that platform, add all its models to `skipModels` and clear `preferredModel`/`preferredKeyId` + - This handles the case where the sticky model points to a banned provider + +- [x] 8. 
Replace LongCat-specific auth/rate-limit error handling with general 5xx consecutive failure detection + - In the `catch (err)` block (lines 1383-1402), remove the LongCat-specific auth error ban (lines 1384-1389) and rate-limit error ban (lines 1390-1395) + - Replace with: check if `getErrorStatus(err)` is a 5xx status (500-599) + - If 5xx: call `recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId)` + - If the provider was just banned (check `isSessionBannedFromPlatform`), clear `preferredModel` and `preferredKeyId` if they point to the banned provider + - Keep the existing `isAuthError()` handling for non-LongCat auth errors (clear sticky key) + - Keep the existing `isRetryableError()` / `shouldSkipModelOnRetry()` logic + +- [x] 9. Add success path counter reset + - In the streaming success path (after `setStickyModel()` around line 1291): add `resetAllConsecutiveFailures(normalizedMessages, routingMode)` + - In the non-streaming success path (after `setStickyModel()` around line 1362): add `resetAllConsecutiveFailures(normalizedMessages, routingMode)` + - This ensures that a successful response clears any accumulated failure counters + +- [x] 10. Update mid-stream error handling with consecutive failure tracking and generalized truncation detection + - In the `catch (streamErr)` block for mid-stream errors (around lines 1294-1346): + - Remove the LongCat-specific truncation handling (lines 1297-1318) + - Add: check if `getErrorStatus(streamErr)` is a 5xx status + - If 5xx: call `recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId)` + - Add: generalized truncation detection for any provider using `isTruncatedResponse()` on partial stream content + - If truncated: call `banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId)` + - Keep the existing mid-stream error SSE event behavior (send error event + return) + +- [x] 11. 
Generalize post-stream truncation detection to all providers + - Update the post-stream truncation check (lines 1236-1242): + - Remove the `route.platform === 'longcat'` guard + - Apply `isTruncatedResponse()` check to any provider + - Update log message to use `route.platform` instead of hardcoded `'longcat'` + - The `isTruncatedResponse()` function itself is NOT modified — it already checks content patterns regardless of provider + +- [x] 12. Remove `addLongcatModelsToSkipModels()` function + - Already replaced by `addProviderModelsToSkipModels()` in step 5 + - Remove it from the exports block + +- [x] 13. Update exports block + - In the exports block (lines 146-157): + - Remove: `addLongcatModelsToSkipModels` + - Keep: `isTruncatedResponse` (retained for all providers) + - Add: `addProviderModelsToSkipModels`, `recordConsecutiveFailure`, `resetConsecutiveFailures`, `resetAllConsecutiveFailures` + +- [x] 14. Update tests — rename and rewrite + - Rename `server/src/__tests__/routes/longcat-session-ban.test.ts` to `provider-session-ban.test.ts` + - Update imports: replace `addLongcatModelsToSkipModels` with `addProviderModelsToSkipModels` + - Keep `isTruncatedResponse` import and test suite (it is retained) + - Update `addLongcatModelsToSkipModels` tests to use `addProviderModelsToSkipModels` with a generic provider parameter + - Add test suite for `recordConsecutiveFailure()`: + - Increments counter on first 5xx + - Bans provider on second consecutive 5xx + - Adds provider models to skipModels on ban + - Creates sticky entry if modelDbId provided + - Does not create entry if no modelDbId and no existing entry + - Add test suite for `resetConsecutiveFailures()`: + - Resets counter for specific provider + - No-op if no sticky session + - No-op if provider has no counter + - Add test suite for `resetAllConsecutiveFailures()`: + - Clears all counters + - No-op if no sticky session + - No-op if no consecutive failures + - Add test suite for 
`addProviderModelsToSkipModels()`: + - Adds all models of given provider to skipModels + - Does not add models of other providers + - Handles empty model list gracefully + - Update integration tests to use generic provider names instead of hardcoded 'longcat' + - Add integration test: two consecutive 503 errors from same provider triggers ban + - Add integration test: success resets consecutive failure counter + - Add integration test: 5xx from provider A, success from provider B resets A's counter + - Add integration test: truncated response from any provider (not just LongCat) triggers ban + - Add integration test: mid-stream truncation from any provider triggers ban + +- [x] 15. TypeScript compilation check + - Run `npx tsc --noEmit` in the `server/` directory + - Ensure no type errors from the new `consecutiveFailures` field or new functions + - Ensure no errors from removed function (`addLongcatModelsToSkipModels`) + - Ensure no errors from retained function (`isTruncatedResponse`) + +- [x] 16. 
Run all tests + - Run `npm test` in the `server/` directory + - Verify no regressions in router tests, proxy tests, or sticky session behavior + - Verify all new provider ban tests pass + - Verify truncation detection tests pass for any provider diff --git a/server/src/__tests__/routes/longcat-session-ban.test.ts b/server/src/__tests__/routes/longcat-session-ban.test.ts deleted file mode 100644 index 1cc1ddeb..00000000 --- a/server/src/__tests__/routes/longcat-session-ban.test.ts +++ /dev/null @@ -1,265 +0,0 @@ -import { describe, it, expect, beforeAll, beforeEach, afterEach, vi } from 'vitest'; -import type { Express } from 'express'; -import { createApp } from '../../app.js'; -import { initDb, getDb, getUnifiedApiKey } from '../../db/index.js'; -import { - isSessionBannedFromPlatform, - banPlatformFromSession, - addLongcatModelsToSkipModels, - isTruncatedResponse, - getSessionKey, - getStickyModel, - setStickyModel, - stickySessionMap, -} from '../../routes/proxy.js'; - -function clearStickyMap() { - (stickySessionMap as Map).clear(); -} - -describe('LongCat session ban functionality', () => { - let app: Express; - - beforeAll(() => { - process.env.ENCRYPTION_KEY = '0'.repeat(64); - initDb(':memory:'); - app = createApp(); - }); - - beforeEach(() => { - clearStickyMap(); - const db = getDb(); - db.prepare('DELETE FROM api_keys').run(); - db.prepare('DELETE FROM requests').run(); - // Insert a dummy LongCat API key so routing can succeed if needed - db.prepare(`INSERT INTO api_keys (platform, label, encrypted_key, iv, auth_tag, status, enabled) - VALUES ('longcat', 'test', 'enc', 'iv', 'tag', 'healthy', 1)`).run(); - }); - - afterEach(() => { - vi.restoreAllMocks(); - }); - - // Helper to create a simple user message array - const makeMessages = (content: string) => [{ role: 'user' as const, content }]; - - // ---------- Test Suite 1: isSessionBannedFromPlatform ---------- - describe('isSessionBannedFromPlatform', () => { - it('returns false when no sticky session 
exists', () => { - const messages = makeMessages('Hello'); - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); - }); - - it('returns false when sticky session exists but no bannedPlatforms', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { modelDbId: 1, lastUsed: Date.now() }); - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); - }); - - it('returns true when the platform is in bannedPlatforms', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 1, - lastUsed: Date.now(), - bannedPlatforms: new Set(['longcat']), - }); - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); - }); - - it('returns false when a different platform is banned', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 1, - lastUsed: Date.now(), - bannedPlatforms: new Set(['groq']), - }); - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); - }); - - it('returns false when the sticky session has expired (past TTL)', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 1, - lastUsed: Date.now() - (31 * 60 * 1000), // 31 minutes ago - bannedPlatforms: new Set(['longcat']), - }); - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); - }); - }); - - // ---------- Test Suite 2: banPlatformFromSession ---------- - describe('banPlatformFromSession', () => { - it('does not create entry if none exists (only modifies existing)', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - 
expect(stickySessionMap.has(key)).toBe(false); - banPlatformFromSession(messages, 'balanced', 'longcat'); - expect(stickySessionMap.has(key)).toBe(false); - }); - - it('adds to existing bannedPlatforms if entry already exists', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 2, - lastUsed: Date.now(), - bannedPlatforms: new Set(['groq']), - }); - banPlatformFromSession(messages, 'balanced', 'longcat'); - const entry = stickySessionMap.get(key); - expect(entry.bannedPlatforms.has('groq')).toBe(true); - expect(entry.bannedPlatforms.has('longcat')).toBe(true); - }); - - it('does not duplicate platforms already banned', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 3, - lastUsed: Date.now(), - bannedPlatforms: new Set(['longcat']), - }); - const beforeSize = stickySessionMap.get(key).bannedPlatforms.size; - banPlatformFromSession(messages, 'balanced', 'longcat'); - const afterSize = stickySessionMap.get(key).bannedPlatforms.size; - expect(afterSize).toBe(beforeSize); - }); - - it('preserves existing modelDbId and keyId when banning', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - (stickySessionMap as Map).set(key, { - modelDbId: 42, - keyId: 7, - lastUsed: Date.now(), - }); - banPlatformFromSession(messages, 'balanced', 'longcat'); - const entry = stickySessionMap.get(key); - expect(entry.modelDbId).toBe(42); - expect(entry.keyId).toBe(7); - }); - - it('refreshes lastUsed TTL when banning', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - const oldTime = Date.now() - (20 * 60 * 1000); // 20 minutes ago - (stickySessionMap as Map).set(key, { - modelDbId: 1, - lastUsed: oldTime, - }); - banPlatformFromSession(messages, 'balanced', 'longcat'); - const 
entry = stickySessionMap.get(key); - expect(entry.lastUsed).toBeGreaterThan(oldTime); - }); - }); - - // ---------- Test Suite 3: addLongcatModelsToSkipModels ---------- - describe('addLongcatModelsToSkipModels', () => { - it('adds all LongCat model IDs to the skipModels set', () => { - const skipModels = new Set(); - addLongcatModelsToSkipModels(skipModels); - const db = getDb(); - const longcatRows = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").all() as any[]; - const ids = longcatRows.map(r => r.id); - ids.forEach(id => expect(skipModels.has(id)).toBe(true)); - }); - - it('does not add non-LongCat model IDs', () => { - const skipModels = new Set(); - addLongcatModelsToSkipModels(skipModels); - const db = getDb(); - const otherRows = db.prepare("SELECT id FROM models WHERE platform != 'longcat' AND enabled = 1").all() as any[]; - otherRows.forEach(r => expect(skipModels.has(r.id)).toBe(false)); - }); - - it('handles empty LongCat model list gracefully', () => { - const db = getDb(); - db.prepare('PRAGMA foreign_keys = OFF').run(); - try { - db.prepare("DELETE FROM models WHERE platform = 'longcat'").run(); - const skipModels = new Set(); - expect(() => addLongcatModelsToSkipModels(skipModels)).not.toThrow(); - expect(skipModels.size).toBe(0); - } finally { - db.prepare('PRAGMA foreign_keys = ON').run(); - } - // Restore by re-initializing DB for subsequent tests - initDb(':memory:'); - }); - }); - - // ---------- Test Suite 4: isTruncatedResponse ---------- - describe('isTruncatedResponse', () => { - const truncationSamples = [ - 'Response was truncated due to length', - 'Truncation error occurred', - 'This response was truncated', - 'truncation detected', - ]; - - truncationSamples.forEach(sample => { - it(`returns true for string containing '${sample}'`, () => { - expect(isTruncatedResponse(sample)).toBe(true); - }); - }); - - it('returns false for normal error messages', () => { - expect(isTruncatedResponse('Invalid API 
key')).toBe(false); - }); - - it('returns false for empty strings', () => { - expect(isTruncatedResponse('')).toBe(false); - }); - - it('handles non-string input gracefully', () => { - // isTruncatedResponse converts to string via String(), so objects become "[object Object]" - expect(isTruncatedResponse({ message: 'truncated' })).toBe(false); - expect(isTruncatedResponse(null)).toBe(false); - expect(isTruncatedResponse(undefined)).toBe(false); - }); - }); - - // ---------- Integration Tests ---------- - describe('Integration: ban lifecycle', () => { - it('ban persists across model changes and expires after TTL', () => { - const messages = makeMessages('Hello'); - const key = getSessionKey(messages, 'balanced'); - const db = getDb(); - const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; - setStickyModel(messages, longcatRow.id, 'balanced'); - // Ban LongCat for this session - banPlatformFromSession(messages, 'balanced', 'longcat'); - // getStickyModel still returns the model (ban check is in routing logic, not getStickyModel) - expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); - // But isSessionBannedFromPlatform should return true - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); - // Simulate TTL expiration by adjusting lastUsed - const entry = stickySessionMap.get(key); - entry.lastUsed = Date.now() - (31 * 60 * 1000); // 31 minutes - // After expiration, ban should be considered cleared - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); - }); - - it('ban check and skipModels work together to prevent banned platform selection', () => { - const messages = makeMessages('Hello'); - const db = getDb(); - const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; - // Set sticky model to a LongCat model - setStickyModel(messages, longcatRow.id, 'balanced'); - // Verify sticky 
model is set - expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); - // Ban LongCat for this session - banPlatformFromSession(messages, 'balanced', 'longcat'); - // Verify ban is registered - expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); - // Verify addLongcatModelsToSkipModels includes the banned model - const skipModels = new Set(); - addLongcatModelsToSkipModels(skipModels); - expect(skipModels.has(longcatRow.id)).toBe(true); - }); - }); -}); diff --git a/server/src/__tests__/routes/provider-session-ban.test.ts b/server/src/__tests__/routes/provider-session-ban.test.ts new file mode 100644 index 00000000..9ad2227b --- /dev/null +++ b/server/src/__tests__/routes/provider-session-ban.test.ts @@ -0,0 +1,451 @@ +import { describe, it, expect, beforeAll, beforeEach, afterEach, vi } from 'vitest'; +import type { Express } from 'express'; +import { createApp } from '../../app.js'; +import { initDb, getDb, getUnifiedApiKey } from '../../db/index.js'; +import { + isSessionBannedFromPlatform, + banPlatformFromSession, + addProviderModelsToSkipModels, + recordConsecutiveFailure, + resetConsecutiveFailures, + resetAllConsecutiveFailures, + isTruncatedResponse, + getSessionKey, + getStickyModel, + setStickyModel, + stickySessionMap, +} from '../../routes/proxy.js'; + +function clearStickyMap() { + (stickySessionMap as Map).clear(); +} + +describe('Provider session ban functionality', () => { + let app: Express; + + beforeAll(() => { + process.env.ENCRYPTION_KEY = '0'.repeat(64); + initDb(':memory:'); + app = createApp(); + }); + + beforeEach(() => { + clearStickyMap(); + const db = getDb(); + db.prepare('DELETE FROM api_keys').run(); + db.prepare('DELETE FROM requests').run(); + // Insert a dummy LongCat API key so routing can succeed if needed + db.prepare(`INSERT INTO api_keys (platform, label, encrypted_key, iv, auth_tag, status, enabled) + VALUES ('longcat', 'test', 'enc', 'iv', 'tag', 'healthy', 1)`).run(); + }); + + 
afterEach(() => { + vi.restoreAllMocks(); + }); + + // Helper to create a simple user message array + const makeMessages = (content: string) => [{ role: 'user' as const, content }]; + + // ---------- Test Suite 1: isSessionBannedFromPlatform ---------- + describe('isSessionBannedFromPlatform', () => { + it('returns false when no sticky session exists', () => { + const messages = makeMessages('Hello'); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('returns false when sticky session exists but no bannedPlatforms', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { modelDbId: 1, lastUsed: Date.now() }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('returns true when the platform is in bannedPlatforms', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now(), + bannedPlatforms: new Set(['longcat']), + }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + }); + + it('returns false when a different platform is banned', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now(), + bannedPlatforms: new Set(['groq']), + }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('returns false when the sticky session has expired (past TTL)', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: Date.now() - (31 * 60 * 1000), // 31 minutes ago + bannedPlatforms: new Set(['longcat']), + }); + expect(isSessionBannedFromPlatform(messages, 'balanced', 
'longcat')).toBe(false); + }); + }); + + // ---------- Test Suite 2: banPlatformFromSession ---------- + describe('banPlatformFromSession', () => { + it('does not create entry if none exists and no modelDbId provided', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + expect(stickySessionMap.has(key)).toBe(false); + banPlatformFromSession(messages, 'balanced', 'longcat'); + expect(stickySessionMap.has(key)).toBe(false); + }); + + it('creates entry if none exists and modelDbId is provided', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + expect(stickySessionMap.has(key)).toBe(false); + banPlatformFromSession(messages, 'balanced', 'longcat', 99); + expect(stickySessionMap.has(key)).toBe(true); + const entry = stickySessionMap.get(key); + expect(entry.modelDbId).toBe(99); + expect(entry.bannedPlatforms.has('longcat')).toBe(true); + }); + + it('adds to existing bannedPlatforms if entry already exists', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 2, + lastUsed: Date.now(), + bannedPlatforms: new Set(['groq']), + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.bannedPlatforms.has('groq')).toBe(true); + expect(entry.bannedPlatforms.has('longcat')).toBe(true); + }); + + it('does not duplicate platforms already banned', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 3, + lastUsed: Date.now(), + bannedPlatforms: new Set(['longcat']), + }); + const beforeSize = stickySessionMap.get(key).bannedPlatforms.size; + banPlatformFromSession(messages, 'balanced', 'longcat'); + const afterSize = stickySessionMap.get(key).bannedPlatforms.size; + expect(afterSize).toBe(beforeSize); + }); + + 
it('preserves existing modelDbId and keyId when banning', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: 42, + keyId: 7, + lastUsed: Date.now(), + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.modelDbId).toBe(42); + expect(entry.keyId).toBe(7); + }); + + it('refreshes lastUsed TTL when banning', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const oldTime = Date.now() - (20 * 60 * 1000); // 20 minutes ago + (stickySessionMap as Map).set(key, { + modelDbId: 1, + lastUsed: oldTime, + }); + banPlatformFromSession(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.lastUsed).toBeGreaterThan(oldTime); + }); + }); + + // ---------- Test Suite 3: addProviderModelsToSkipModels ---------- + describe('addProviderModelsToSkipModels', () => { + it('adds all model IDs of the given provider to the skipModels set', () => { + const skipModels = new Set(); + addProviderModelsToSkipModels(skipModels, 'longcat'); + const db = getDb(); + const longcatRows = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").all() as any[]; + expect(longcatRows.length).toBeGreaterThan(0); + const ids = longcatRows.map(r => r.id); + ids.forEach(id => expect(skipModels.has(id)).toBe(true)); + }); + + it('does not add models of other providers', () => { + const skipModels = new Set(); + addProviderModelsToSkipModels(skipModels, 'longcat'); + const db = getDb(); + const otherRows = db.prepare("SELECT id FROM models WHERE platform != 'longcat' AND enabled = 1").all() as any[]; + otherRows.forEach(r => expect(skipModels.has(r.id)).toBe(false)); + }); + + it('handles empty provider model list gracefully', () => { + const db = getDb(); + db.prepare('PRAGMA foreign_keys = OFF').run(); + 
db.prepare('BEGIN').run(); + db.prepare("DELETE FROM api_keys WHERE platform = 'longcat'").run(); + db.prepare("DELETE FROM models WHERE platform = 'longcat'").run(); + const skipModels = new Set(); + expect(() => addProviderModelsToSkipModels(skipModels, 'longcat')).not.toThrow(); + expect(skipModels.size).toBe(0); + db.prepare('ROLLBACK').run(); + db.prepare('PRAGMA foreign_keys = ON').run(); + }); + }); + + // ---------- Test Suite 4: recordConsecutiveFailure ---------- + describe('recordConsecutiveFailure', () => { + it('does not create entry if no sticky session and no modelDbId', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + recordConsecutiveFailure(messages, 'balanced', 'longcat', new Set()); + expect(stickySessionMap.has(key)).toBe(false); + }); + + it('creates entry and increments counter on first 5xx when modelDbId provided', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + expect(stickySessionMap.has(key)).toBe(true); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.get('longcat')).toBe(1); + expect(entry.modelDbId).toBe(42); + }); + + it('increments counter to 2 and bans provider on second consecutive 5xx', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + const entry = stickySessionMap.get(key); + expect(entry.bannedPlatforms.has('longcat')).toBe(true); + }); + + it('adds provider models to skipModels on ban', () => { + const messages = makeMessages('Hello'); + const skipModels = new Set(); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE 
platform = 'longcat' AND enabled = 1").get() as any; + expect(longcatRow).toBeDefined(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, longcatRow.id); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, longcatRow.id); + expect(skipModels.has(longcatRow.id)).toBe(true); + }); + + it('tracks different providers independently', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + recordConsecutiveFailure(messages, 'balanced', 'groq', skipModels, 43); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.get('longcat')).toBe(1); + expect(entry.consecutiveFailures.get('groq')).toBe(1); + }); + }); + + // ---------- Test Suite 5: resetConsecutiveFailures ---------- + describe('resetConsecutiveFailures', () => { + it('resets counter for specific provider', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + recordConsecutiveFailure(messages, 'balanced', 'groq', skipModels, 43); + resetConsecutiveFailures(messages, 'balanced', 'longcat'); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.has('longcat')).toBe(false); + expect(entry.consecutiveFailures.get('groq')).toBe(1); + }); + + it('no-op if no sticky session', () => { + const messages = makeMessages('Hello'); + expect(() => resetConsecutiveFailures(messages, 'balanced', 'longcat')).not.toThrow(); + }); + + it('no-op if provider has no counter', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { modelDbId: 1, lastUsed: Date.now() }); + expect(() => resetConsecutiveFailures(messages, 'balanced', 'longcat')).not.toThrow(); + 
}); + }); + + // ---------- Test Suite 6: resetAllConsecutiveFailures ---------- + describe('resetAllConsecutiveFailures', () => { + it('clears all counters', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + recordConsecutiveFailure(messages, 'balanced', 'groq', skipModels, 43); + resetAllConsecutiveFailures(messages, 'balanced'); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.size).toBe(0); + }); + + it('no-op if no sticky session', () => { + const messages = makeMessages('Hello'); + expect(() => resetAllConsecutiveFailures(messages, 'balanced')).not.toThrow(); + }); + + it('no-op if no consecutive failures', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { modelDbId: 1, lastUsed: Date.now() }); + expect(() => resetAllConsecutiveFailures(messages, 'balanced')).not.toThrow(); + }); + }); + + // ---------- Test Suite 7: isTruncatedResponse ---------- + describe('isTruncatedResponse', () => { + const truncationSamples = [ + 'Response was truncated due to length', + 'Truncation error occurred', + 'This response was truncated', + 'truncation detected', + 'context_length_exceeded', + 'token_limit exceeded', + 'maximum length reached', + 'response_length_limit hit', + 'conflict in response', + ]; + + truncationSamples.forEach(sample => { + it(`returns true for string containing '${sample}'`, () => { + expect(isTruncatedResponse(sample)).toBe(true); + }); + }); + + it('returns false for normal error messages', () => { + expect(isTruncatedResponse('Invalid API key')).toBe(false); + }); + + it('returns false for empty strings', () => { + expect(isTruncatedResponse('')).toBe(false); + }); + + it('handles non-string input gracefully', () => { + // Objects with truncation keywords are now detected 
via JSON.stringify + expect(isTruncatedResponse({ message: 'truncated' })).toBe(true); + expect(isTruncatedResponse({ error: 'context_length_exceeded' })).toBe(true); + expect(isTruncatedResponse({ foo: 'bar' })).toBe(false); + expect(isTruncatedResponse(null)).toBe(false); + expect(isTruncatedResponse(undefined)).toBe(false); + expect(isTruncatedResponse(123)).toBe(false); + }); + }); + + // ---------- Integration Tests ---------- + describe('Integration: ban lifecycle', () => { + it('ban persists across model changes and expires after TTL', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + expect(longcatRow).toBeDefined(); + setStickyModel(messages, longcatRow.id, 'balanced'); + // Ban longcat for this session + banPlatformFromSession(messages, 'balanced', 'longcat'); + // getStickyModel still returns the model (ban check is in routing logic, not getStickyModel) + expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); + // But isSessionBannedFromPlatform should return true + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + // Simulate TTL expiration by adjusting lastUsed + const entry = stickySessionMap.get(key); + entry.lastUsed = Date.now() - (31 * 60 * 1000); // 31 minutes + // After expiration, ban should be considered cleared + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + }); + + it('ban check and skipModels work together to prevent banned platform selection', () => { + const messages = makeMessages('Hello'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + expect(longcatRow).toBeDefined(); + // Set sticky model to a longcat model + setStickyModel(messages, longcatRow.id, 'balanced'); + // Verify sticky 
model is set + expect(getStickyModel(messages, 'balanced')).toBe(longcatRow.id); + // Ban longcat for this session + banPlatformFromSession(messages, 'balanced', 'longcat'); + // Verify ban is registered + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + // Verify addProviderModelsToSkipModels includes the banned model + const skipModels = new Set(); + addProviderModelsToSkipModels(skipModels, 'longcat'); + expect(skipModels.has(longcatRow.id)).toBe(true); + }); + + it('two consecutive 5xx failures from same provider triggers ban', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + expect(longcatRow).toBeDefined(); + const skipModels = new Set(); + // First 5xx + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, longcatRow.id); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(false); + // Second consecutive 5xx → ban + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, longcatRow.id); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + }); + + it('success resets consecutive failure counter', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.get('longcat')).toBe(1); + // Simulate success + resetAllConsecutiveFailures(messages, 'balanced'); + expect(entry.consecutiveFailures.size).toBe(0); + }); + + it('5xx from provider A then success from provider B resets A counter', () => { + const messages = makeMessages('Hello'); + const key = getSessionKey(messages, 'balanced'); + const skipModels = new Set(); + 
recordConsecutiveFailure(messages, 'balanced', 'longcat', skipModels, 42); + // Success from any provider resets all counters + resetAllConsecutiveFailures(messages, 'balanced'); + const entry = stickySessionMap.get(key); + expect(entry.consecutiveFailures.has('longcat')).toBe(false); + }); + }); + + // ---------- Integration: Truncation from any provider ---------- + describe('Integration: truncation detection for any provider', () => { + it('truncated response from any provider triggers ban via banPlatformFromSession', () => { + const messages = makeMessages('Hello'); + const db = getDb(); + const longcatRow = db.prepare("SELECT id FROM models WHERE platform = 'longcat' AND enabled = 1").get() as any; + expect(longcatRow).toBeDefined(); + // Simulate truncation detection calling banPlatformFromSession + banPlatformFromSession(messages, 'balanced', 'longcat', longcatRow.id); + expect(isSessionBannedFromPlatform(messages, 'balanced', 'longcat')).toBe(true); + }); + + it('isTruncatedResponse detects truncation patterns in error messages', () => { + expect(isTruncatedResponse('The response was truncated')).toBe(true); + expect(isTruncatedResponse('context_length_exceeded error')).toBe(true); + expect(isTruncatedResponse('some other error')).toBe(false); + }); + }); +}); diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 245b7c0e..8986d341 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -13,7 +13,7 @@ export const proxyRouter: Router = Router(); // Sticky sessions: track which model served each "session" // Key: hash of first user message → model_db_id // This prevents model switching mid-conversation which causes hallucination -const stickySessionMap = new Map; lastUsed: number }>(); +const stickySessionMap = new Map; consecutiveFailures?: Map; lastUsed: number }>(); const STICKY_TTL_MS = 30 * 60 * 1000; // 30 min session TTL const responseSessionMap = new Map(); const responseItemMap = new Map(); @@ -70,7 +70,7 @@ 
function getStickyKey(messages: ChatMessage[], routingMode: RoutingMode): number return undefined; } - // If session is banned from the sticky model's platform, don't return sticky key + // Check if the sticky model's platform is banned if (entry.bannedPlatforms) { const db = getDb(); const modelRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(entry.modelDbId) as { platform: string } | undefined; @@ -97,7 +97,10 @@ function isSessionBannedFromPlatform( if (!key) return false; const entry = stickySessionMap.get(key); if (!entry) return false; - if (Date.now() - entry.lastUsed > STICKY_TTL_MS) return false; // expired = no ban + if (Date.now() - entry.lastUsed > STICKY_TTL_MS) { + stickySessionMap.delete(key); + return false; + } return entry.bannedPlatforms?.has(platform) ?? false; } @@ -105,41 +108,118 @@ function banPlatformFromSession( messages: ChatMessage[], routingMode: RoutingMode, platform: string, + modelDbId?: number, ): void { const key = getSessionKey(messages, routingMode); if (!key) return; - const entry = stickySessionMap.get(key); - if (!entry) return; + let entry = stickySessionMap.get(key); + if (!entry) { + if (modelDbId === undefined) return; + entry = { modelDbId, bannedPlatforms: new Set(), lastUsed: Date.now() }; + } if (!entry.bannedPlatforms) entry.bannedPlatforms = new Set(); entry.bannedPlatforms.add(platform); - entry.lastUsed = Date.now(); // refresh TTL so the ban persists + entry.lastUsed = Date.now(); stickySessionMap.set(key, entry); console.log(`[Sticky] banned platform=${platform} for session=${key.slice(0, 8)} | bannedPlatforms=${Array.from(entry.bannedPlatforms).join(',')}`); } -function addLongcatModelsToSkipModels(skipModels: Set): void { +function addProviderModelsToSkipModels(skipModels: Set, provider: string): void { const db = getDb(); - const longcatModels = db.prepare( + const providerModels = db.prepare( 'SELECT id FROM models WHERE platform = ? 
AND enabled = 1' - ).all('longcat') as Array<{ id: number }>; - for (const m of longcatModels) { + ).all(provider) as Array<{ id: number }>; + for (const m of providerModels) { skipModels.add(m.id); } - console.log(`[Sticky] added ${longcatModels.length} longcat model(s) to skipModels: [${longcatModels.map(m => m.id).join(',')}]`); + console.log(`[Sticky] added ${providerModels.length} ${provider} model(s) to skipModels: [${providerModels.map(m => m.id).join(',')}]`); +} + +function recordConsecutiveFailure( + messages: ChatMessage[], + routingMode: RoutingMode, + provider: string, + skipModels: Set, + modelDbId?: number, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + let entry = stickySessionMap.get(key); + if (!entry) { + if (modelDbId === undefined) return; + entry = { modelDbId, lastUsed: Date.now() }; + } + if (!entry.consecutiveFailures) entry.consecutiveFailures = new Map(); + const count = (entry.consecutiveFailures.get(provider) ?? 0) + 1; + entry.consecutiveFailures.set(provider, count); + entry.lastUsed = Date.now(); + stickySessionMap.set(key, entry); + console.log(`[Sticky] consecutive 5xx for ${provider}: ${count}/2 session=${key.slice(0, 8)}`); + if (count >= 2) { + if (!entry.bannedPlatforms) entry.bannedPlatforms = new Set(); + entry.bannedPlatforms.add(provider); + addProviderModelsToSkipModels(skipModels, provider); + entry.consecutiveFailures.delete(provider); + entry.lastUsed = Date.now(); + stickySessionMap.set(key, entry); + console.log(`[Sticky] banned platform=${provider} for session=${key.slice(0, 8)} | bannedPlatforms=${Array.from(entry.bannedPlatforms).join(',')}`); + } +} + +function resetConsecutiveFailures( + messages: ChatMessage[], + routingMode: RoutingMode, + provider: string, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (entry.consecutiveFailures?.has(provider)) { + 
entry.consecutiveFailures.delete(provider); + console.log(`[Sticky] reset consecutive failures for ${provider} session=${key.slice(0, 8)}`); + } +} + +function resetAllConsecutiveFailures( + messages: ChatMessage[], + routingMode: RoutingMode, +): void { + const key = getSessionKey(messages, routingMode); + if (!key) return; + const entry = stickySessionMap.get(key); + if (!entry) return; + if (entry.consecutiveFailures && entry.consecutiveFailures.size > 0) { + entry.consecutiveFailures.clear(); + console.log(`[Sticky] reset all consecutive failures session=${key.slice(0, 8)}`); + } } function isTruncatedResponse(errOrContent: any): boolean { if (!errOrContent) return false; - const str = String(errOrContent).toLowerCase(); - return str.includes('truncated') - || str.includes('truncation') + let text: string; + if (typeof errOrContent === 'string') { + text = errOrContent; + } else if (typeof errOrContent === 'object') { + try { text = JSON.stringify(errOrContent); } catch { return false; } + } else { + return false; + } + const lower = text.toLowerCase(); + return lower.includes('truncated') || lower.includes('truncation') || + lower.includes('context_length_exceeded') || lower.includes('token_limit') || + lower.includes('maximum length') || lower.includes('response_length_limit') || + lower.includes('conflict'); } // Exported for testing purposes only export { isSessionBannedFromPlatform, banPlatformFromSession, - addLongcatModelsToSkipModels, + addProviderModelsToSkipModels, + recordConsecutiveFailure, + resetConsecutiveFailures, + resetAllConsecutiveFailures, isTruncatedResponse, getSessionKey, getStickyModel, @@ -172,11 +252,12 @@ function setStickyModel(messages: ChatMessage[], modelDbId: number, routingMode: const key = getSessionKey(messages, routingMode); if (!key) return; - // Preserve bannedPlatforms from existing entry (if session was previously banned) + // Preserve bannedPlatforms and consecutiveFailures from existing entry const existing = 
stickySessionMap.get(key); const bannedPlatforms = existing?.bannedPlatforms; + const consecutiveFailures = existing?.consecutiveFailures; - stickySessionMap.set(key, { modelDbId, keyId, bannedPlatforms, lastUsed: Date.now() }); + stickySessionMap.set(key, { modelDbId, keyId, bannedPlatforms, consecutiveFailures, lastUsed: Date.now() }); console.log(`[Sticky] set key=${key.slice(0, 8)} | msgs=${messages.length} → modelDbId=${modelDbId}${keyId !== undefined ? ` keyId=${keyId}` : ''}${bannedPlatforms && bannedPlatforms.size > 0 ? ` banned=${Array.from(bannedPlatforms).join(',')}` : ''}`); if (stickySessionMap.size > 500) { @@ -1112,35 +1193,29 @@ async function handleChatCompletion( preferredModel = getStickyModel(normalizedMessages, routingMode); } - // Sticky key: prefer the same API key within a LongCat session for - // session continuity on the provider side. Only pass preferredKeyId - // when the sticky model maps to the LongCat platform. + // Sticky key: prefer the same API key within a session for + // session continuity on the provider side. getStickyKey() already + // returns undefined when the sticky model's platform is banned. let preferredKeyId: number | undefined; if (preferredModel && !requestedModel) { const stickyKeyId = getStickyKey(normalizedMessages, routingMode); if (stickyKeyId !== undefined) { - const db = getDb(); - const row = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; - if (row?.platform === 'longcat') { - preferredKeyId = stickyKeyId; - console.log(`[Sticky] key preferred modelDbId=${preferredModel} keyId=${preferredKeyId} (longcat)`); - } + preferredKeyId = stickyKeyId; + console.log(`[Sticky] key preferred modelDbId=${preferredModel} keyId=${preferredKeyId}`); } } - // Check if session is banned from LongCat — if so, skip all LongCat models - // and clear any preferredModel/preferredKeyId that points to LongCat. 
+ // Check if session is banned from the preferred model's platform — if so, + // skip all models of that platform and clear preferredModel/preferredKeyId. const skipModels = new Set(); - if (isSessionBannedFromPlatform(normalizedMessages, routingMode, 'longcat')) { - addLongcatModelsToSkipModels(skipModels); - if (preferredModel) { - const db = getDb(); - const row = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; - if (row?.platform === 'longcat') { - console.log(`[Sticky] skipping preferredModel=${preferredModel} (longcat banned for session)`); - preferredModel = undefined; - preferredKeyId = undefined; - } + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow && isSessionBannedFromPlatform(normalizedMessages, routingMode, prefRow.platform)) { + addProviderModelsToSkipModels(skipModels, prefRow.platform); + console.log(`[Sticky] skipping preferredModel=${preferredModel} (${prefRow.platform} banned for session)`); + preferredModel = undefined; + preferredKeyId = undefined; } } @@ -1223,14 +1298,14 @@ async function handleChatCompletion( } } - // Check for truncated response content after stream completes on LongCat. + // Check for truncated response content after stream completes. // The stream has already been sent to the client — no retry within same request. - // Future requests in this session will route to non-LongCat models. - if (route.platform === 'longcat') { + // Future requests in this session will route to other providers. + { const streamTextToCheck = responseStreamContext ? 
responseStreamContext.outputText : streamedText; if (isTruncatedResponse(streamTextToCheck)) { - console.warn(`[Proxy] LongCat truncated stream content detected — banning longcat for session`); - banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + console.warn(`[Proxy] Truncated stream content detected from ${route.platform} — banning ${route.platform} for session`); + banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); } } @@ -1282,14 +1357,21 @@ async function handleChatCompletion( recordTokens(route.platform, route.modelId, route.keyId, estimatedInputTokens + totalOutputTokens); recordSuccess(route.modelDbId); setStickyModel(normalizedMessages, route.modelDbId, routingMode, route.keyId); + resetAllConsecutiveFailures(normalizedMessages, routingMode); logRequest(route.platform, route.modelId, 'success', estimatedInputTokens, totalOutputTokens, Date.now() - start, ttfbMs, null); return; } catch (streamErr: any) { if (streamStarted) { - // Check for LongCat truncation error mid-stream — end gracefully, not with error event - if (route.platform === 'longcat' && isTruncatedResponse(streamErr.message)) { - console.warn(`[Proxy] LongCat truncation error mid-stream — banning longcat for session, ending stream gracefully`); - banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); + // General 5xx consecutive failure detection for mid-stream errors + const streamErrStatus = getErrorStatus(streamErr); + if (streamErrStatus && streamErrStatus >= 500 && streamErrStatus < 600) { + recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); + } + + // Generalized truncation detection for any provider (not just LongCat) + if (isTruncatedResponse(streamErr.message)) { + console.warn(`[Proxy] Truncation error mid-stream from ${route.platform} — banning ${route.platform} for session, ending stream gracefully`); + banPlatformFromSession(normalizedMessages, routingMode, 
route.platform, route.modelDbId); try { if (responseStreamContext) { writeResponseStreamEvent(res, { @@ -1353,6 +1435,7 @@ async function handleChatCompletion( recordTokens(route.platform, route.modelId, route.keyId, totalTokens); recordSuccess(route.modelDbId); setStickyModel(normalizedMessages, route.modelDbId, routingMode, route.keyId); + resetAllConsecutiveFailures(normalizedMessages, routingMode); res.setHeader('X-Routed-Via', `${route.platform}/${route.modelId}`); if (attempt > 0) res.setHeader('X-Fallback-Attempts', String(attempt)); @@ -1372,25 +1455,18 @@ async function handleChatCompletion( const latency = Date.now() - start; logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, 0, latency, null, err.message); - // Detect LongCat multiple-key-use errors and ban the platform for the session - if (route.platform === 'longcat') { - if (isAuthError(err)) { - console.warn(`[Proxy] LongCat auth error — banning longcat for session (multiple key use detected)`); - banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); - addLongcatModelsToSkipModels(skipModels); - preferredKeyId = undefined; - } - if (isRateLimitError(err)) { - console.warn(`[Proxy] LongCat rate-limit error — banning longcat for session (key rotation detected)`); - banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); - addLongcatModelsToSkipModels(skipModels); - preferredKeyId = undefined; - } - if (isTruncatedResponse(err.message) || isTruncatedResponse(err?.responseBody)) { - console.warn(`[Proxy] LongCat truncated response — banning longcat for session`); - banPlatformFromSession(normalizedMessages, routingMode, 'longcat'); - addLongcatModelsToSkipModels(skipModels); - preferredKeyId = undefined; + // General 5xx consecutive failure detection — works for any provider + const errStatus = getErrorStatus(err); + if (errStatus && errStatus >= 500 && errStatus < 600) { + recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, 
skipModels, route.modelDbId); + // If this provider was just banned, clear preferredModel/preferredKeyId if they point to it + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === route.platform) { + preferredModel = undefined; + preferredKeyId = undefined; + } } } @@ -1403,10 +1479,9 @@ async function handleChatCompletion( if (isRateLimitError(err)) { setCooldown(route.platform, route.modelId, route.keyId, 120_000); } - // Auth errors (401/403) on non-LongCat: clear the sticky key for this session + // Auth errors (401/403): clear the sticky key for this session // so the retry unpins the broken key and falls through to round-robin. - // LongCat auth errors are handled above with platform ban instead. - if (isAuthError(err) && route.platform !== 'longcat') { + if (isAuthError(err)) { const authStatus = getErrorStatus(err); console.warn(`[Proxy] auth error ${authStatus} from ${route.displayName}/${route.modelId}, clearing sticky key for session`); clearStickyKey(normalizedMessages, routingMode); From f212b00d3b98e0894f34c6a7c748036d7b031bed Mon Sep 17 00:00:00 2001 From: vi Date: Tue, 2 Jun 2026 00:24:01 +0300 Subject: [PATCH 5/8] fix(proxy): address code review findings for provider 5xx session ban - Add isBanEligibleStatus() helper restricting to {500,502,503,504} - Improve mid-stream truncation detection with aggregated error sources - Pre-routing ban check now skips ALL banned platforms, not just preferredModel's - Only clear preferredModel when provider is actually banned (not on first 5xx) - Handle Error objects in isTruncatedResponse (instanceof check before JSON.stringify) --- server/src/routes/proxy.ts | 52 +++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 8986d341..2b341640 100644 --- 
a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -200,6 +200,8 @@ function isTruncatedResponse(errOrContent: any): boolean { let text: string; if (typeof errOrContent === 'string') { text = errOrContent; + } else if (errOrContent instanceof Error) { + text = errOrContent.message; } else if (typeof errOrContent === 'object') { try { text = JSON.stringify(errOrContent); } catch { return false; } } else { @@ -542,6 +544,10 @@ function getErrorStatus(err: any): number | undefined { return typeof status === 'number' ? status : undefined; } +function isBanEligibleStatus(status: number): boolean { + return status === 500 || status === 502 || status === 503 || status === 504; +} + function getErrorMessage(err: any): string { return String(err?.message ?? err?.error?.message ?? 'Unknown provider error'); } @@ -1205,14 +1211,28 @@ async function handleChatCompletion( } } - // Check if session is banned from the preferred model's platform — if so, - // skip all models of that platform and clear preferredModel/preferredKeyId. + // Check if session is banned from any platform — add all banned platforms' models to skipModels + // and clear preferredModel/preferredKeyId if it points to a banned platform. 
const skipModels = new Set(); + const sessionKey = getSessionKey(normalizedMessages, routingMode); + if (sessionKey) { + const entry = stickySessionMap.get(sessionKey); + if (entry) { + if (Date.now() - entry.lastUsed > STICKY_TTL_MS) { + stickySessionMap.delete(sessionKey); + } else if (entry.bannedPlatforms) { + for (const platform of entry.bannedPlatforms) { + addProviderModelsToSkipModels(skipModels, platform); + console.log(`[Sticky] session banned from ${platform}, adding to skipModels`); + } + } + } + } + if (preferredModel) { const db = getDb(); const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; if (prefRow && isSessionBannedFromPlatform(normalizedMessages, routingMode, prefRow.platform)) { - addProviderModelsToSkipModels(skipModels, prefRow.platform); console.log(`[Sticky] skipping preferredModel=${preferredModel} (${prefRow.platform} banned for session)`); preferredModel = undefined; preferredKeyId = undefined; @@ -1364,12 +1384,25 @@ async function handleChatCompletion( if (streamStarted) { // General 5xx consecutive failure detection for mid-stream errors const streamErrStatus = getErrorStatus(streamErr); - if (streamErrStatus && streamErrStatus >= 500 && streamErrStatus < 600) { + if (streamErrStatus && isBanEligibleStatus(streamErrStatus)) { recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); } - + // Generalized truncation detection for any provider (not just LongCat) - if (isTruncatedResponse(streamErr.message)) { + // Aggregate all possible error text sources for comprehensive detection + const truncationTexts: string[] = []; + if (streamErr instanceof Error) { + truncationTexts.push(streamErr.message); + } + if (streamErr?.response?.data) { + truncationTexts.push(typeof streamErr.response.data === 'string' ? 
streamErr.response.data : JSON.stringify(streamErr.response.data)); + } + if (streamErr?.body) { + truncationTexts.push(typeof streamErr.body === 'string' ? streamErr.body : JSON.stringify(streamErr.body)); + } + truncationTexts.push(String(streamErr)); + const combinedTruncationText = truncationTexts.join(' '); + if (isTruncatedResponse(combinedTruncationText)) { console.warn(`[Proxy] Truncation error mid-stream from ${route.platform} — banning ${route.platform} for session, ending stream gracefully`); banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); try { @@ -1457,10 +1490,11 @@ async function handleChatCompletion( // General 5xx consecutive failure detection — works for any provider const errStatus = getErrorStatus(err); - if (errStatus && errStatus >= 500 && errStatus < 600) { + if (errStatus && isBanEligibleStatus(errStatus)) { recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); - // If this provider was just banned, clear preferredModel/preferredKeyId if they point to it - if (preferredModel) { + // Only clear preferredModel/preferredKeyId if the provider was just banned + // (i.e., this was the 2nd consecutive 5xx). Don't clear on the first failure. 
+ if (preferredModel && isSessionBannedFromPlatform(normalizedMessages, routingMode, route.platform)) { const db = getDb(); const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; if (prefRow?.platform === route.platform) { From 7dd4189e0db8a278483cedd8e5a19e6179e0fed4 Mon Sep 17 00:00:00 2001 From: vi Date: Tue, 2 Jun 2026 01:14:27 +0300 Subject: [PATCH 6/8] feat(proxy): add LongCat sticky session cooldown safeguard --- .roo/specs/longcat-sticky-cooldown/design.md | 135 +++++++ .../longcat-sticky-cooldown/requirements.md | 101 ++++++ .roo/specs/longcat-sticky-cooldown/tasks.md | 9 + .../src/__tests__/routes/proxy-tools.test.ts | 334 ++++++++++++++++++ server/src/routes/proxy.ts | 19 + 5 files changed, 598 insertions(+) create mode 100644 .roo/specs/longcat-sticky-cooldown/design.md create mode 100644 .roo/specs/longcat-sticky-cooldown/requirements.md create mode 100644 .roo/specs/longcat-sticky-cooldown/tasks.md diff --git a/.roo/specs/longcat-sticky-cooldown/design.md b/.roo/specs/longcat-sticky-cooldown/design.md new file mode 100644 index 00000000..a4f3ae0a --- /dev/null +++ b/.roo/specs/longcat-sticky-cooldown/design.md @@ -0,0 +1,135 @@ +# Design: LongCat Sticky Session Cooldown Safeguard + +## Architecture + +This feature is a **single-point insertion** into the existing request flow in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098). No new data structures, no new modules, no new functions — just a conditional check that temporarily suppresses sticky preference when the cooldown is active. + +## Decision Flow + +The cooldown check is inserted **after** the sticky model/key lookup and **after** the ban check, but **before** the values are passed to [`routeRequest()`](server/src/services/router.ts:458). This ordering ensures: + +1. Sticky session lookups happen first (establishing `preferredModel` and `preferredKeyId`) +2. 
Ban checks happen next (clearing preferences if the platform is banned — bans take precedence) +3. Cooldown check happens last (suppressing preferences only if no ban is active) +4. The final `preferredModel` / `preferredKeyId` values are passed to the router + +```mermaid +flowchart TD + A[Request arrives] --> B[getStickyModel - get pinned model] + B --> C[getStickyKey - get pinned key] + C --> D[Check bannedPlatforms - add to skipModels] + D --> E{Is preferredModel on a banned platform?} + E -- Yes --> F[Clear preferredModel + preferredKeyId] + E -- No --> G{Is preferredModel on LongCat platform?} + G -- No --> H[Keep sticky preference - pass to routeRequest] + G -- Yes --> I{Is lastUsed within 3 minutes?} + I -- No --> H + I -- Yes --> J[Suppress preferredModel + preferredKeyId for this request only] + J --> K[Log cooldown bypass] + K --> L[Pass undefined preferredModel + preferredKeyId to routeRequest] + F --> L + H --> M[Pass preferredModel + preferredKeyId to routeRequest] + L --> N[Bandit router picks freely] + M --> O[Router forces sticky model to position 0] + N --> P[Route result] + O --> P + P --> Q[Request succeeds] + Q --> R[setStickyModel - updates lastUsed - resets cooldown] +``` + +## Implementation Details + +### 1. New Constant + +Add alongside existing constants at the top of [`proxy.ts`](server/src/routes/proxy.ts:17): + +```typescript +const LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000; // 3 min — bypass sticky preference for LongCat if session was used within this window +``` + +### 2. Cooldown Check Insertion Point + +The check is inserted in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) at the point where `preferredModel` and `preferredKeyId` have been fully resolved (after sticky lookups and ban checks), right before the retry loop that calls [`routeRequest()`](server/src/services/router.ts:458). + +Current code flow (lines ~1198-1244): + +``` +1. preferredModel = getStickyModel(...) // line 1199 +2. 
preferredKeyId = getStickyKey(...) // line 1207-1212 +3. skipModels from bannedPlatforms // line 1216-1230 +4. Clear preferredModel if on banned platform // line 1232-1240 +5. ← INSERT COOLDOWN CHECK HERE +6. Retry loop with routeRequest(...) // line 1247+ +``` + +### 3. Cooldown Check Logic + +```typescript +// LongCat sticky cooldown: if the sticky model is on LongCat and was used +// within the last 3 minutes, bypass sticky preference for this request only. +// The bandit router picks freely — it may still route to LongCat organically. +if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === 'longcat') { + const sessionKey = getSessionKey(normalizedMessages, routingMode); + const entry = sessionKey ? stickySessionMap.get(sessionKey) : undefined; + if (entry && Date.now() - entry.lastUsed < LONGCAT_STICKY_COOLDOWN_MS) { + const ageMs = Date.now() - entry.lastUsed; + console.log(`[Sticky] LongCat cooldown active — bypassing sticky preference for session=${sessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); + preferredModel = undefined; + preferredKeyId = undefined; + } + } +} +``` + +**Key design decisions in this logic:** + +- **DB lookup for platform**: We already do a `SELECT platform FROM models WHERE id = ?` query at line 1234 for the ban check. The cooldown check needs the same data. We can reuse the `prefRow` from the ban check if we restructure slightly, or do a separate query. Since this is a lightweight in-memory SQLite query and the ban check may have already cleared `preferredModel`, a separate query after the ban check is cleaner and more self-contained. +- **Reads `lastUsed` directly from the map**: No new function needed. The `stickySessionMap` entry is already accessible via `getSessionKey()` + `stickySessionMap.get()`. 
+- **Only suppresses, never deletes**: `preferredModel` and `preferredKeyId` are local variables in the handler function. Setting them to `undefined` for this request has no effect on the `stickySessionMap` entry. The next request will re-read from the map and make a fresh cooldown decision. +- **Defensive `entry` check**: If `sessionKey` is empty or the entry doesn't exist (shouldn't happen since `preferredModel` was found, but defensive), the cooldown is skipped. + +### 4. Interaction with Smart-Mode LongCat Boost + +When the cooldown suppresses `preferredModel`, the router's [`routeRequest()`](server/src/services/router.ts:458) receives no sticky preference. In smart mode, the LongCat boost (lines 499-527) still applies — it moves LongCat entries to the front of the Thompson-sampled sorted list. This means: + +- **Without cooldown**: LongCat is forced to position 0 via sticky pin + boosted to front via smart mode → guaranteed LongCat +- **With cooldown**: LongCat is NOT forced to position 0, but still boosted to front via smart mode → very likely LongCat, but other models with high sampled scores can win + +This is the intended behavior. The cooldown prevents *guaranteed* pinning while still giving LongCat a strong probability via the boost. + +### 5. Cooldown Reset on Success + +When a request succeeds, [`setStickyModel()`](server/src/routes/proxy.ts:253) is called (line 1379 for streaming, line 1470 for non-streaming), which sets `lastUsed = Date.now()`. This naturally resets the cooldown window. No additional code is needed — the existing behavior already handles this. + +### 6. 
Edge Cases + +| Edge Case | Behavior | +|---|---| +| Session has no `lastUsed` (defensive) | Cooldown check skips — `entry.lastUsed` is always set by `setStickyModel()`, but if missing, treat as no cooldown | +| `preferredModel` already cleared by ban | Cooldown check's `if (preferredModel)` guard skips — ban takes precedence | +| Explicit model request (`requestedModel` is set) | `preferredModel` comes from DB lookup, not sticky session — cooldown doesn't apply because the user explicitly chose a model | +| First request in a new session | No sticky entry exists → `preferredModel` is `undefined` → cooldown check skips | +| Server restart | `stickySessionMap` is in-memory and empty after restart → no sticky sessions → cooldown irrelevant until sessions are established | +| Multiple concurrent requests for same session | Each request independently reads `lastUsed` and makes its own cooldown decision. Node.js is single-threaded so no race conditions on the read | + +## Test Strategy + +Tests should be added to [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) covering: + +1. **Cooldown active**: Sticky session on LongCat with `lastUsed` < 3 min ago → `preferredModel` and `preferredKeyId` should be suppressed +2. **Cooldown expired**: Sticky session on LongCat with `lastUsed` > 3 min ago → sticky preference preserved +3. **Non-LongCat provider**: Sticky session on Groq with `lastUsed` < 3 min ago → sticky preference preserved (no cooldown) +4. **Ban takes precedence**: Sticky session on LongCat with `lastUsed` < 3 min ago AND LongCat is banned → ban clears preference first, cooldown check is skipped +5. **No sticky session**: No entry in `stickySessionMap` → cooldown check skipped, no effect +6. 
**Explicit model request**: User requests a specific LongCat model → cooldown doesn't apply + +## Files Requiring Modification + +| # | File | Change | Lines Affected | +|---|---|---|---| +| 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) | Add `LONGCAT_STICKY_COOLDOWN_MS` constant | After line 17 | +| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1240) | Add cooldown check after ban check, before retry loop | After line 1240 | +| 3 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Add unit tests for cooldown logic | New test section | \ No newline at end of file diff --git a/.roo/specs/longcat-sticky-cooldown/requirements.md b/.roo/specs/longcat-sticky-cooldown/requirements.md new file mode 100644 index 00000000..b35f3993 --- /dev/null +++ b/.roo/specs/longcat-sticky-cooldown/requirements.md @@ -0,0 +1,101 @@ +# Requirements: LongCat Sticky Session Cooldown Safeguard + +## Overview + +Add a **cooldown safeguard** for the LongCat provider's sticky sessions: when a sticky session is pinned to a LongCat model AND the session was used within the last 3 minutes, bypass the sticky model/key preference for that request only and let the bandit router pick freely. The sticky session entry itself stays intact — if the bandit router picks LongCat again organically, that's fine. After the 3-minute cooldown window expires, sticky session preference resumes normally. 
+ +## Context + +The existing sticky sessions feature lives in [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:13-16): + +- **In-memory Map** (`stickySessionMap`) stores `{ modelDbId, keyId?, bannedPlatforms?, consecutiveFailures?, lastUsed }` keyed by SHA-1 hash of `routingMode + firstUserMessage` +- **30-min TTL** with 500-entry max and eviction +- **`getStickyModel()`** — looks up the pinned model DB ID for a session +- **`getStickyKey()`** — looks up the pinned key ID for a session (LongCat-specific) +- **`setStickyModel()`** — stores model/key after every successful response, updates `lastUsed` + +The proxy handler in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) determines `preferredModel` and `preferredKeyId` from sticky session lookups, then passes them to [`routeRequest()`](server/src/services/router.ts:458). The router forces the preferred model to position 0 regardless of bandit score, and the preferred key is tried first before round-robin. + +**The problem**: LongCat benefits from sticky keys for session continuity, but rapid-fire requests within a short window (e.g., a user sending multiple messages in quick succession) all get pinned to the same LongCat key. This can overwhelm LongCat's per-key rate limits or trigger throttling on their side. Giving the bandit router a chance to distribute load during high-frequency bursts improves overall reliability while preserving sticky session benefits for normal conversation pacing. + +## Functional Requirements + +### FR-1: Cooldown Detection + +When determining `preferredModel` and `preferredKeyId` in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098), the system must check whether the sticky session's pinned model is on the **LongCat** platform AND whether `lastUsed` is within the last **3 minutes** (180,000 ms). Both conditions must be true for the cooldown to activate. 
+ +### FR-2: Cooldown Behavior — Temporary Bypass + +When the cooldown is active (FR-1 conditions met), the system must: +1. Set `preferredModel = undefined` for this request only — the bandit router picks freely based on scores +2. Set `preferredKeyId = undefined` for this request only — no sticky key preference +3. **NOT** modify or delete the `stickySessionMap` entry — the session remains intact +4. Log the bypass: `[Sticky] LongCat cooldown active — bypassing sticky preference for session=<key> | lastUsed=<ageMs>ms ago` (where `<key>` is the first 8 characters of the session key and `<ageMs>` is the time since `lastUsed` in milliseconds) + +### FR-3: Cooldown Expiry + +After the 3-minute window elapses (i.e., `Date.now() - entry.lastUsed > 180,000`), sticky session preference for LongCat resumes normally. No explicit "cooldown clear" action is needed — the check is purely time-based on each request. + +### FR-4: Bandit Router Freedom + +When the cooldown bypasses sticky preference, the bandit router may still route to LongCat organically (if LongCat scores highest in Thompson Sampling). This is acceptable and expected — the safeguard prevents *forced* pinning, not *organic* routing. + +### FR-5: Successful Response Updates lastUsed + +When a request succeeds (regardless of whether it was routed via sticky preference or bandit freedom), [`setStickyModel()`](server/src/routes/proxy.ts:253) updates `lastUsed` to `Date.now()`. This means each successful response restarts the 3-minute cooldown window: a session whose requests arrive less than 3 minutes apart keeps bypassing sticky preference, and sticky pinning resumes only after a gap longer than the window. + +### FR-6: Provider-Specific — LongCat Only + +This cooldown safeguard applies **only** to the LongCat provider. Sticky sessions pinned to other providers (Groq, Cerebras, Google, etc.) must continue to use their sticky preference immediately, regardless of `lastUsed` age. + +### FR-7: Interaction with Existing Bans + +If the session already has LongCat banned via `bannedPlatforms`, the existing ban logic takes precedence — `preferredModel` and `preferredKeyId` are already cleared by the ban check. 
The cooldown safeguard is irrelevant when LongCat is already banned for the session. The cooldown check must not override or interfere with ban logic. + +### FR-8: Interaction with Smart Mode LongCat Boost + +In smart routing mode, [`routeRequest()`](server/src/services/router.ts:499-527) moves LongCat entries to the front of the sorted list when any LongCat key has capacity. When the cooldown bypasses sticky preference (`preferredModel = undefined`), the smart-mode LongCat boost still applies — LongCat gets priority in the bandit order but is not *forced* to position 0 via sticky pinning. This is the intended behavior: the boost gives LongCat a strong chance, but other models can still win via Thompson Sampling. + +## Non-Functional Requirements + +### NFR-1: No Database Schema Changes + +The cooldown is purely time-based, using the existing `lastUsed` field in `stickySessionMap`. No database schema changes are required. + +### NFR-2: No New State or Data Structures + +No new Map, Set, or other data structure is needed. The cooldown check reads `lastUsed` from the existing `stickySessionMap` entry and compares it to `Date.now()`. + +### NFR-3: No UI Changes + +This is a backend-only feature. No client-side changes are needed. + +### NFR-4: Minimal Performance Impact + +The cooldown check adds one timestamp comparison and one platform lookup per request. No additional I/O or computation beyond what already exists. + +### NFR-5: Configurable Cooldown Window + +The 3-minute cooldown window must be defined as a named constant (`LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000`) at the top of [`proxy.ts`](server/src/routes/proxy.ts:1) alongside existing constants like `STICKY_TTL_MS`, making it easy to adjust in the future. + +### NFR-6: Backward Compatibility + +Existing sessions without a `lastUsed` field (impossible in current code, but defensively) must not trigger the cooldown. 
The check must handle `lastUsed` being `undefined` or `0` by treating it as "no cooldown — use sticky preference." + +## Files Requiring Modification + +| # | File | Change Type | Description | +|---|---|---|---| +| 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) | Edit | Add `LONGCAT_STICKY_COOLDOWN_MS` constant | +| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1198-1212) | Edit | Add cooldown check after sticky model/key lookup, before passing to `routeRequest()` | +| 3 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Edit | Add unit tests for cooldown logic | + +## Out of Scope + +- Cooldown safeguards for providers other than LongCat +- Persistent cooldown state across server restarts (in-memory only, same as existing sticky sessions) +- Client-side UI changes or configuration +- Changes to the Thompson Sampling algorithm or bandit scoring +- Changes to rate limiting logic +- Changes to the router's LongCat smart-mode boost logic +- Making the cooldown window configurable via admin API or environment variable (constant only) \ No newline at end of file diff --git a/.roo/specs/longcat-sticky-cooldown/tasks.md b/.roo/specs/longcat-sticky-cooldown/tasks.md new file mode 100644 index 00000000..3a6e8ce6 --- /dev/null +++ b/.roo/specs/longcat-sticky-cooldown/tasks.md @@ -0,0 +1,9 @@ +# Tasks: LongCat Sticky Session Cooldown Safeguard + +## Task List + +- [x] Add `LONGCAT_STICKY_COOLDOWN_MS` constant (3 * 60 * 1000) after `STICKY_TTL_MS` in [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) +- [x] Add cooldown check logic in [`handleChatCompletion()`](server/src/routes/proxy.ts:1240) — after ban check clears `preferredModel`, before the retry loop: if `preferredModel` is on `longcat` platform AND `stickySessionMap` entry's `lastUsed` is within `LONGCAT_STICKY_COOLDOWN_MS`, set `preferredModel = undefined` and `preferredKeyId = undefined` with log message +- [x] Add 
unit tests in [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) covering: cooldown active (suppresses preference), cooldown expired (preserves preference), non-LongCat provider (no cooldown), ban precedence over cooldown, no sticky session (no effect), explicit model request (cooldown doesn't apply) +- [x] Run existing test suite to verify no regressions: `pnpm --filter server test` +- [ ] Manual smoke test: send rapid requests to a LongCat-pinned session and verify that requests within 3 min bypass sticky preference while requests after 3 min resume it \ No newline at end of file diff --git a/server/src/__tests__/routes/proxy-tools.test.ts b/server/src/__tests__/routes/proxy-tools.test.ts index eca7d967..f50226e8 100644 --- a/server/src/__tests__/routes/proxy-tools.test.ts +++ b/server/src/__tests__/routes/proxy-tools.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, beforeAll, beforeEach, afterEach, vi } from 'vite import type { Express } from 'express'; import { createApp } from '../../app.js'; import { initDb, getDb, getUnifiedApiKey } from '../../db/index.js'; +import { stickySessionMap, getSessionKey } from '../../routes/proxy.js'; async function request(app: Express, method: string, path: string, body?: any) { const server = app.listen(0); @@ -799,3 +800,336 @@ describe('Proxy tool-calling support', () => { }); }); }); + +describe('LongCat sticky session cooldown', () => { + let app: Express; + + beforeAll(() => { + process.env.ENCRYPTION_KEY = '0'.repeat(64); + initDb(':memory:'); + app = createApp(); + }); + + beforeEach(() => { + (stickySessionMap as Map).clear(); + const db = getDb(); + db.prepare('DELETE FROM api_keys').run(); + db.prepare('DELETE FROM requests').run(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + const makeMessages = (content: string) => [{ role: 'user' as const, content }]; + + it('suppresses sticky preference when LongCat cooldown is active', async () => { + 
const db = getDb(); + const longcatRow = db.prepare('SELECT id FROM models WHERE platform = ? AND enabled = 1').get('longcat') as { id: number } | undefined; + expect(longcatRow).toBeDefined(); + + // Set up sticky session on LongCat with recent lastUsed (within 3 min cooldown) + const messages = makeMessages('cooldown active test'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: longcatRow!.id, + lastUsed: Date.now() - 1000, // 1 second ago — within cooldown + }); + + // Add API keys via proper endpoint (encrypts correctly) so routing can succeed + await request(app, 'POST', '/api/keys', { + platform: 'longcat', + key: 'lc_cooldown_active_test', + label: 'cooldown-active-longcat', + }); + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_cooldown_active_test', + label: 'cooldown-active-groq', + }); + + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? 
url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-cooldown-active', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'cooldown active test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages, + }); + + expect(status).toBe(200); + // Cooldown should have triggered and logged the bypass message + expect(logSpy).toHaveBeenCalledWith( + expect.stringContaining('[Sticky] LongCat cooldown active') + ); + }); + + it('preserves sticky preference when LongCat cooldown has expired', async () => { + const db = getDb(); + const longcatRow = db.prepare('SELECT id FROM models WHERE platform = ? 
AND enabled = 1').get('longcat') as { id: number } | undefined; + expect(longcatRow).toBeDefined(); + + // Set up sticky session on LongCat with old lastUsed (beyond 3 min cooldown) + const messages = makeMessages('cooldown expired test'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: longcatRow!.id, + lastUsed: Date.now() - 4 * 60 * 1000, // 4 minutes ago — cooldown expired + }); + + // Add LongCat API key via proper endpoint (encrypts correctly) so routing to LongCat can succeed + await request(app, 'POST', '/api/keys', { + platform: 'longcat', + key: 'lc_cooldown_expired_test', + label: 'cooldown-expired-longcat', + }); + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_cooldown_expired_test', + label: 'cooldown-expired-groq', + }); + + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + let routedToLongcat = false; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? 
url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + if (urlStr.includes('longcat')) routedToLongcat = true; + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-cooldown-expired', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'cooldown expired test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages, + }); + + expect(status).toBe(200); + // No cooldown message should appear — cooldown expired + expect(logSpy).not.toHaveBeenCalledWith( + expect.stringContaining('[Sticky] LongCat cooldown active') + ); + // Sticky preference preserved — should route to LongCat + expect(routedToLongcat).toBe(true); + }); + + it('does not apply cooldown for non-LongCat sticky sessions', async () => { + const db = getDb(); + const groqRow = db.prepare('SELECT id FROM models WHERE platform = ? 
AND enabled = 1 LIMIT 1').get('groq') as { id: number } | undefined; + expect(groqRow).toBeDefined(); + + // Set up sticky session on Groq with recent lastUsed (would be within cooldown if LongCat) + const messages = makeMessages('non longcat cooldown test'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: groqRow!.id, + lastUsed: Date.now() - 1000, // 1 second ago + }); + + // Add Groq API key + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_non_longcat_test', + label: 'non-longcat-test', + }); + + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + let routedToGroq = false; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + if (urlStr.includes('groq')) routedToGroq = true; + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-non-longcat', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'non-longcat test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages, + }); + + expect(status).toBe(200); + // No cooldown message for non-LongCat provider + expect(logSpy).not.toHaveBeenCalledWith( + expect.stringContaining('[Sticky] LongCat cooldown active') + ); + // Sticky preference preserved — should route to Groq + expect(routedToGroq).toBe(true); + }); + + it('ban takes precedence over cooldown — no cooldown log when banned', async () => { + const db = getDb(); + 
const longcatRow = db.prepare('SELECT id FROM models WHERE platform = ? AND enabled = 1').get('longcat') as { id: number } | undefined; + expect(longcatRow).toBeDefined(); + + // Set up sticky session on LongCat with recent lastUsed AND LongCat banned + const messages = makeMessages('ban precedence test'); + const key = getSessionKey(messages, 'balanced'); + (stickySessionMap as Map).set(key, { + modelDbId: longcatRow!.id, + lastUsed: Date.now() - 1000, // within cooldown window + bannedPlatforms: new Set(['longcat']), + }); + + // Add Groq key as fallback (LongCat is banned for this session) + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_ban_precedence_test', + label: 'ban-precedence-groq', + }); + + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-ban-precedence', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'ban precedence test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages, + }); + + expect(status).toBe(200); + // Ban message should appear (ban clears preferredModel before cooldown check) + expect(logSpy).toHaveBeenCalledWith( + expect.stringContaining('banned for session') + ); + // No cooldown message — ban took precedence and cleared preferredModel first + expect(logSpy).not.toHaveBeenCalledWith( + 
expect.stringContaining('[Sticky] LongCat cooldown active') + ); + }); + + it('no effect when no sticky session exists', async () => { + // No sticky session set up — map is cleared in beforeEach + + // Add a Groq key so routing can succeed + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_no_session_test', + label: 'no-session-test', + }); + + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-no-session', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'no session test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages: makeMessages('no sticky session test'), + }); + + expect(status).toBe(200); + // No cooldown message should appear — no sticky session to check + expect(logSpy).not.toHaveBeenCalledWith( + expect.stringContaining('[Sticky] LongCat cooldown active') + ); + }); +}); diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 2b341640..99d904df 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -15,6 +15,7 @@ export const proxyRouter: Router = Router(); // This prevents model switching mid-conversation which causes hallucination const stickySessionMap = new Map; consecutiveFailures?: Map; lastUsed: number }>(); const STICKY_TTL_MS = 30 * 60 * 1000; // 30 min 
session TTL +const LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000; // 3 min — bypass sticky preference for LongCat if session was used within this window const responseSessionMap = new Map(); const responseItemMap = new Map(); const RESPONSE_SESSION_TTL_MS = 30 * 60 * 1000; @@ -1239,6 +1240,24 @@ async function handleChatCompletion( } } + // LongCat sticky cooldown: if the sticky model is on LongCat and was used + // within the last 3 minutes, bypass sticky preference for this request only. + // The bandit router picks freely — it may still route to LongCat organically. + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === 'longcat') { + const cooldownSessionKey = getSessionKey(normalizedMessages, routingMode); + const cooldownEntry = cooldownSessionKey ? stickySessionMap.get(cooldownSessionKey) : undefined; + if (cooldownEntry && Date.now() - cooldownEntry.lastUsed < LONGCAT_STICKY_COOLDOWN_MS) { + const ageMs = Date.now() - cooldownEntry.lastUsed; + console.log(`[Sticky] LongCat cooldown active — bypassing sticky preference for session=${cooldownSessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); + preferredModel = undefined; + preferredKeyId = undefined; + } + } + } + // Retry loop: skip bad keys and, for non-rate-limit errors, skip the model // entirely so the fallback chain can move to a different provider/model. 
const skipKeys = new Set(); From 5f6f0b090ec7d5fe2236849d352412b1117f11e1 Mon Sep 17 00:00:00 2001 From: vi Date: Tue, 2 Jun 2026 01:34:16 +0300 Subject: [PATCH 7/8] =?UTF-8?q?feat(router):=20longcat=20sticky=20cooldown?= =?UTF-8?q?=20=E2=80=94=20exclude=20LongCat=20from=20bandit=20for=20other?= =?UTF-8?q?=20sessions=20during=203-min=20cooldown=20window?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .roo/specs/longcat-sticky-cooldown/design.md | 132 ++++++++---------- .../longcat-sticky-cooldown/requirements.md | 57 ++++---- .roo/specs/longcat-sticky-cooldown/tasks.md | 8 +- .../src/__tests__/routes/proxy-tools.test.ts | 89 +++++++++++- server/src/routes/proxy.ts | 10 +- server/src/services/router.ts | 2 +- 6 files changed, 185 insertions(+), 113 deletions(-) diff --git a/.roo/specs/longcat-sticky-cooldown/design.md b/.roo/specs/longcat-sticky-cooldown/design.md index a4f3ae0a..3f204c7b 100644 --- a/.roo/specs/longcat-sticky-cooldown/design.md +++ b/.roo/specs/longcat-sticky-cooldown/design.md @@ -2,17 +2,10 @@ ## Architecture -This feature is a **single-point insertion** into the existing request flow in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098). No new data structures, no new modules, no new functions — just a conditional check that temporarily suppresses sticky preference when the cooldown is active. +This feature is a **single-point modification** to the existing cooldown check in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098). Instead of clearing `preferredModel`/`preferredKeyId` (old behavior), the cooldown now adds all LongCat models to `skipModels` while keeping the sticky session pinned. ## Decision Flow -The cooldown check is inserted **after** the sticky model/key lookup and **after** the ban check, but **before** the values are passed to [`routeRequest()`](server/src/services/router.ts:458). This ordering ensures: - -1. 
Sticky session lookups happen first (establishing `preferredModel` and `preferredKeyId`) -2. Ban checks happen next (clearing preferences if the platform is banned — bans take precedence) -3. Cooldown check happens last (suppressing preferences only if no ban is active) -4. The final `preferredModel` / `preferredKeyId` values are passed to the router - ```mermaid flowchart TD A[Request arrives] --> B[getStickyModel - get pinned model] @@ -24,17 +17,16 @@ flowchart TD G -- No --> H[Keep sticky preference - pass to routeRequest] G -- Yes --> I{Is lastUsed within 3 minutes?} I -- No --> H - I -- Yes --> J[Suppress preferredModel + preferredKeyId for this request only] - J --> K[Log cooldown bypass] - K --> L[Pass undefined preferredModel + preferredKeyId to routeRequest] - F --> L - H --> M[Pass preferredModel + preferredKeyId to routeRequest] - L --> N[Bandit router picks freely] - M --> O[Router forces sticky model to position 0] - N --> P[Route result] - O --> P - P --> Q[Request succeeds] - Q --> R[setStickyModel - updates lastUsed - resets cooldown] + I -- Yes --> J[Add all LongCat models to skipModels] + J --> K[Keep preferredModel + preferredKeyId intact] + K --> L[Log cooldown activation] + L --> M[Pass preferredModel + preferredKeyId + skipModels to routeRequest] + F --> N[Pass undefined preferredModel + preferredKeyId to routeRequest] + H --> O[Pass preferredModel + preferredKeyId to routeRequest] + M --> P[Router forces sticky LongCat model to position 0] + P --> Q[Router skips LongCat models for other sessions via skipModels] + N --> R[Bandit router picks freely - no LongCat available] + O --> S[Router forces sticky model to position 0] ``` ## Implementation Details @@ -44,92 +36,82 @@ flowchart TD Add alongside existing constants at the top of [`proxy.ts`](server/src/routes/proxy.ts:17): ```typescript -const LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000; // 3 min — bypass sticky preference for LongCat if session was used within this window +const 
LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000; // 3 min — exclude LongCat from bandit routing for other sessions ``` -### 2. Cooldown Check Insertion Point - -The check is inserted in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) at the point where `preferredModel` and `preferredKeyId` have been fully resolved (after sticky lookups and ban checks), right before the retry loop that calls [`routeRequest()`](server/src/services/router.ts:458). - -Current code flow (lines ~1198-1244): +### 2. Cooldown Check — Modified Logic -``` -1. preferredModel = getStickyModel(...) // line 1199 -2. preferredKeyId = getStickyKey(...) // line 1207-1212 -3. skipModels from bannedPlatforms // line 1216-1230 -4. Clear preferredModel if on banned platform // line 1232-1240 -5. ← INSERT COOLDOWN CHECK HERE -6. Retry loop with routeRequest(...) // line 1247+ -``` - -### 3. Cooldown Check Logic +Replace the existing cooldown check (lines ~1243-1259) with: ```typescript // LongCat sticky cooldown: if the sticky model is on LongCat and was used -// within the last 3 minutes, bypass sticky preference for this request only. -// The bandit router picks freely — it may still route to LongCat organically. +// within the last 3 minutes, exclude LongCat from the bandit router for all +// other sessions. The current sticky session keeps its pinned LongCat route. +// This prevents LongCat from seeing multiple sessions/keys from the same IP. if (preferredModel) { const db = getDb(); const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; if (prefRow?.platform === 'longcat') { - const sessionKey = getSessionKey(normalizedMessages, routingMode); - const entry = sessionKey ? 
stickySessionMap.get(sessionKey) : undefined; - if (entry && Date.now() - entry.lastUsed < LONGCAT_STICKY_COOLDOWN_MS) { - const ageMs = Date.now() - entry.lastUsed; - console.log(`[Sticky] LongCat cooldown active — bypassing sticky preference for session=${sessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); - preferredModel = undefined; - preferredKeyId = undefined; + const cooldownSessionKey = getSessionKey(normalizedMessages, routingMode); + const cooldownEntry = cooldownSessionKey ? stickySessionMap.get(cooldownSessionKey) : undefined; + if (cooldownEntry && Date.now() - cooldownEntry.lastUsed < LONGCAT_STICKY_COOLDOWN_MS) { + const ageMs = Date.now() - cooldownEntry.lastUsed; + addProviderModelsToSkipModels(skipModels, 'longcat'); + console.log(`[Sticky] LongCat cooldown active — excluding LongCat from bandit routing for other sessions | session=${cooldownSessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); } } } ``` -**Key design decisions in this logic:** +**Key design decisions:** + +- **`preferredModel` and `preferredKeyId` are NOT cleared** — the sticky session keeps its LongCat pin +- **`addProviderModelsToSkipModels(skipModels, 'longcat')`** adds all LongCat model IDs to the skip set — the router's existing `skipModels` check (line 539) skips these models for any session that doesn't have them as `preferredModel` +- **NOTE(review)**: `skipModels` appears to be built per request inside `handleChatCompletion()` — confirm that IDs added here are actually visible to requests from *other* sessions; if the set is request-local, this step only removes non-pinned LongCat models from the current request's fallback chain +- **The sticky session must be exempted from skipModels** — in `routeRequest()`, forcing the sticky model to position 0 (line 530-536) does not by itself bypass the skipModels check (line 539); that check must also exempt the preferred model (`entry.model_db_id !== preferredModelDbId`) so the sticky session still reaches its pinned LongCat model +- **Reuses existing `addProviderModelsToSkipModels()` helper** — no new functions needed -- **DB lookup for platform**: We already do a `SELECT platform FROM models WHERE id = ?` query at line 1234 for the ban check. The cooldown check needs the same data. We can reuse the `prefRow` from the ban check if we restructure slightly, or do a separate query.
Since this is a lightweight in-memory SQLite query and the ban check may have already cleared `preferredModel`, a separate query after the ban check is cleaner and more self-contained. -- **Reads `lastUsed` directly from the map**: No new function needed. The `stickySessionMap` entry is already accessible via `getSessionKey()` + `stickySessionMap.get()`. -- **Only suppresses, never deletes**: `preferredModel` and `preferredKeyId` are local variables in the handler function. Setting them to `undefined` for this request has no effect on the `stickySessionMap` entry. The next request will re-read from the map and make a fresh cooldown decision. -- **Defensive `entry` check**: If `sessionKey` is empty or the entry doesn't exist (shouldn't happen since `preferredModel` was found, but defensive), the cooldown is skipped. +### 3. How skipModels Protects the Sticky Session -### 4. Interaction with Smart-Mode LongCat Boost +In [`routeRequest()`](server/src/services/router.ts:538-539): -When the cooldown suppresses `preferredModel`, the router's [`routeRequest()`](server/src/services/router.ts:458) receives no sticky preference. In smart mode, the LongCat boost (lines 499-527) still applies — it moves LongCat entries to the front of the Thompson-sampled sorted list. This means: +```typescript +for (const entry of sorted) { + if (skipModels?.has(entry.model_db_id)) continue; +``` -- **Without cooldown**: LongCat is forced to position 0 via sticky pin + boosted to front via smart mode → guaranteed LongCat -- **With cooldown**: LongCat is NOT forced to position 0, but still boosted to front via smart mode → very likely LongCat, but other models with high sampled scores can win +The sticky model is forced to position 0 via `sorted.unshift(preferred)` at line 534. When the loop iterates, the sticky LongCat model is first in the array. The `skipModels` check skips it... **but wait** — this means the sticky session's LongCat model would also be skipped! 
-This is the intended behavior. The cooldown prevents *guaranteed* pinning while still giving LongCat a strong probability via the boost. +This is a problem. The `skipModels` check applies to ALL entries including the sticky model. We need to ensure the sticky model is NOT skipped even when LongCat is in `skipModels`. -### 5. Cooldown Reset on Success +**Solution**: The `skipModels` check should be: skip the model ONLY if it's not the preferred (sticky) model. We need to modify the router's skip check: -When a request succeeds, [`setStickyModel()`](server/src/routes/proxy.ts:253) is called (line 1379 for streaming, line 1470 for non-streaming), which sets `lastUsed = Date.now()`. This naturally resets the cooldown window. No additional code is needed — the existing behavior already handles this. +```typescript +if (skipModels?.has(entry.model_db_id) && entry.model_db_id !== preferredModelDbId) continue; +``` -### 6. Edge Cases +This ensures the sticky session's LongCat model is never skipped, while all other LongCat models (and LongCat models for other sessions) are skipped. -| Edge Case | Behavior | -|---|---| -| Session has no `lastUsed` (defensive) | Cooldown check skips — `entry.lastUsed` is always set by `setStickyModel()`, but if missing, treat as no cooldown | -| `preferredModel` already cleared by ban | Cooldown check's `if (preferredModel)` guard skips — ban takes precedence | -| Explicit model request (`requestedModel` is set) | `preferredModel` comes from DB lookup, not sticky session — cooldown doesn't apply because the user explicitly chose a model | -| First request in a new session | No sticky entry exists → `preferredModel` is `undefined` → cooldown check skips | -| Server restart | `stickySessionMap` is in-memory and empty after restart → no sticky sessions → cooldown irrelevant until sessions are established | -| Multiple concurrent requests for same session | Each request independently reads `lastUsed` and makes its own cooldown decision. 
Node.js is single-threaded so no race conditions on the read | +### 4. Cooldown Reset on Success -## Test Strategy +When a request succeeds, [`setStickyModel()`](server/src/routes/proxy.ts:253) sets `lastUsed` to `Date.now()`, resetting the cooldown window. The next request re-evaluates the cooldown. -Tests should be added to [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) covering: +### 5. Edge Cases -1. **Cooldown active**: Sticky session on LongCat with `lastUsed` < 3 min ago → `preferredModel` and `preferredKeyId` should be suppressed -2. **Cooldown expired**: Sticky session on LongCat with `lastUsed` > 3 min ago → sticky preference preserved -3. **Non-LongCat provider**: Sticky session on Groq with `lastUsed` < 3 min ago → sticky preference preserved (no cooldown) -4. **Ban takes precedence**: Sticky session on LongCat with `lastUsed` < 3 min ago AND LongCat is banned → ban clears preference first, cooldown check is skipped -5. **No sticky session**: No entry in `stickySessionMap` → cooldown check skipped, no effect -6. **Explicit model request**: User requests a specific LongCat model → cooldown doesn't apply +| Edge Case | Behavior | +|---|---| +| Session has no `lastUsed` (defensive) | Cooldown check skips — no exclusion | +| `preferredModel` already cleared by ban | Cooldown check's `if (preferredModel)` guard skips | +| LongCat already in `skipModels` from ban | `addProviderModelsToSkipModels` adds duplicate IDs to Set — no-op, no harm | +| Explicit model request (`requestedModel` is set) | `preferredModel` comes from DB lookup, not sticky — cooldown doesn't apply | +| First request in a new session | No sticky entry → `preferredModel` is `undefined` → cooldown skips | +| Server restart | `stickySessionMap` empty → no cooldown until sessions established | +| Only LongCat models available during cooldown | Non-sticky sessions fail with 429/502 (all models exhausted). Sticky session still routes to its pinned LongCat model.
| ## Files Requiring Modification | # | File | Change | Lines Affected | |---|---|---|---| | 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) | Add `LONGCAT_STICKY_COOLDOWN_MS` constant | After line 17 | -| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1240) | Add cooldown check after ban check, before retry loop | After line 1240 | -| 3 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Add unit tests for cooldown logic | New test section | \ No newline at end of file +| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1243) | Replace cooldown logic: add LongCat to `skipModels` instead of clearing `preferredModel`/`preferredKeyId` | Lines ~1243-1259 | +| 3 | [`server/src/services/router.ts`](server/src/services/router.ts:539) | Modify `skipModels` check to exclude the sticky model: `&& entry.model_db_id !== preferredModelDbId` | Line 539 | +| 4 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Update unit tests to match new behavior | Test section | \ No newline at end of file diff --git a/.roo/specs/longcat-sticky-cooldown/requirements.md b/.roo/specs/longcat-sticky-cooldown/requirements.md index b35f3993..721f42ad 100644 --- a/.roo/specs/longcat-sticky-cooldown/requirements.md +++ b/.roo/specs/longcat-sticky-cooldown/requirements.md @@ -2,7 +2,7 @@ ## Overview -Add a **cooldown safeguard** for the LongCat provider's sticky sessions: when a sticky session is pinned to a LongCat model AND the session was used within the last 3 minutes, bypass the sticky model/key preference for that request only and let the bandit router pick freely. The sticky session entry itself stays intact — if the bandit router picks LongCat again organically, that's fine. After the 3-minute cooldown window expires, sticky session preference resumes normally. 
+Add a **cooldown safeguard** for the LongCat provider's sticky sessions: when a sticky session is pinned to a LongCat model AND the session was used within the last 3 minutes, **exclude LongCat entirely from the bandit router** for all other sessions. The current sticky session stays pinned to its LongCat model+key — only that session may use LongCat. All other sessions (both `auto` and `auto-smart`) must route to non-LongCat providers during the cooldown window. ## Context @@ -14,9 +14,11 @@ The existing sticky sessions feature lives in [`server/src/routes/proxy.ts`](ser - **`getStickyKey()`** — looks up the pinned key ID for a session (LongCat-specific) - **`setStickyModel()`** — stores model/key after every successful response, updates `lastUsed` -The proxy handler in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) determines `preferredModel` and `preferredKeyId` from sticky session lookups, then passes them to [`routeRequest()`](server/src/services/router.ts:458). The router forces the preferred model to position 0 regardless of bandit score, and the preferred key is tried first before round-robin. +The proxy handler in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) determines `preferredModel` and `preferredKeyId` from sticky session lookups, then passes them to [`routeRequest()`](server/src/services/router.ts:458). The router forces the preferred model to position 0 regardless of bandit score. -**The problem**: LongCat benefits from sticky keys for session continuity, but rapid-fire requests within a short window (e.g., a user sending multiple messages in quick succession) all get pinned to the same LongCat key. This can overwhelm LongCat's per-key rate limits or trigger throttling on their side. Giving the bandit router a chance to distribute load during high-frequency bursts improves overall reliability while preserving sticky session benefits for normal conversation pacing. 
+**The problem**: LongCat does not like multiple sessions or multiple API keys from the same IP. When the bandit router freely picks LongCat for other sessions (or the smart-mode boost moves LongCat to the front), it can route multiple sessions to LongCat simultaneously, triggering IP-level throttling on LongCat's side. + +**The solution**: When a LongCat sticky session is "hot" (used within 3 minutes), add all LongCat models to `skipModels` so the bandit router cannot route any other session to LongCat. The current sticky session keeps its pinned LongCat route. After the 3-minute cooldown expires, LongCat becomes available to the bandit router again. ## Functional Requirements @@ -24,37 +26,40 @@ The proxy handler in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098) When determining `preferredModel` and `preferredKeyId` in [`handleChatCompletion()`](server/src/routes/proxy.ts:1098), the system must check whether the sticky session's pinned model is on the **LongCat** platform AND whether `lastUsed` is within the last **3 minutes** (180,000 ms). Both conditions must be true for the cooldown to activate. -### FR-2: Cooldown Behavior — Temporary Bypass +### FR-2: Cooldown Behavior — Exclude LongCat from Bandit Router When the cooldown is active (FR-1 conditions met), the system must: -1. Set `preferredModel = undefined` for this request only — the bandit router picks freely based on scores -2. Set `preferredKeyId = undefined` for this request only — no sticky key preference -3. **NOT** modify or delete the `stickySessionMap` entry — the session remains intact -4. Log the bypass: `[Sticky] LongCat cooldown active — bypassing sticky preference for session= | lastUsed=ms ago` +1. **Keep** `preferredModel` and `preferredKeyId` intact — the current sticky session stays pinned to its LongCat model+key +2. **Add all LongCat models to `skipModels`** — this prevents the bandit router from routing any other session to LongCat +3. 
Log the cooldown activation: `[Sticky] LongCat cooldown active — excluding LongCat from bandit routing for other sessions | session=<key> | lastUsed=<age>ms ago` ### FR-3: Cooldown Expiry -After the 3-minute window elapses (i.e., `Date.now() - entry.lastUsed > 180,000`), sticky session preference for LongCat resumes normally. No explicit "cooldown clear" action is needed — the check is purely time-based on each request. +After the 3-minute window elapses (i.e., `Date.now() - entry.lastUsed >= 180,000`), LongCat is automatically available to the bandit router again because the cooldown check no longer triggers. No explicit "cooldown clear" action is needed. -### FR-4: Bandit Router Freedom +### FR-4: Sticky Session Preserves LongCat Access -When the cooldown bypasses sticky preference, the bandit router may still route to LongCat organically (if LongCat scores highest in Thompson Sampling). This is acceptable and expected — the safeguard prevents *forced* pinning, not *organic* routing. +The current sticky session's `preferredModel` and `preferredKeyId` are never cleared by the cooldown. The sticky session always routes to its pinned LongCat model+key regardless of cooldown state. The cooldown only affects the bandit router's ability to route *other* sessions to LongCat. ### FR-5: Successful Response Updates lastUsed -When a request succeeds (regardless of whether it was routed via sticky preference or bandit freedom), [`setStickyModel()`](server/src/routes/proxy.ts:253) updates `lastUsed` to `Date.now()`. This means each successful response resets the 3-minute cooldown window, preventing indefinite bypass for active conversations. +When a request succeeds, [`setStickyModel()`](server/src/routes/proxy.ts:253) updates `lastUsed` to `Date.now()`. This resets the 3-minute cooldown window. No additional code is needed — existing behavior handles this. ### FR-6: Provider-Specific — LongCat Only -This cooldown safeguard applies **only** to the LongCat provider. Sticky sessions pinned to other providers (Groq, Cerebras, Google, etc.)
must continue to use their sticky preference immediately, regardless of `lastUsed` age. +This cooldown safeguard applies **only** to the LongCat provider. Sticky sessions pinned to other providers do not trigger any cooldown exclusion. ### FR-7: Interaction with Existing Bans -If the session already has LongCat banned via `bannedPlatforms`, the existing ban logic takes precedence — `preferredModel` and `preferredKeyId` are already cleared by the ban check. The cooldown safeguard is irrelevant when LongCat is already banned for the session. The cooldown check must not override or interfere with ban logic. +If the session already has LongCat banned via `bannedPlatforms`, the existing ban logic already adds LongCat models to `skipModels`. The cooldown check must not duplicate or interfere with ban logic. If LongCat is already in `skipModels` (from a ban), the cooldown check should still log but not re-add. ### FR-8: Interaction with Smart Mode LongCat Boost -In smart routing mode, [`routeRequest()`](server/src/services/router.ts:499-527) moves LongCat entries to the front of the sorted list when any LongCat key has capacity. When the cooldown bypasses sticky preference (`preferredModel = undefined`), the smart-mode LongCat boost still applies — LongCat gets priority in the bandit order but is not *forced* to position 0 via sticky pinning. This is the intended behavior: the boost gives LongCat a strong chance, but other models can still win via Thompson Sampling. +In smart routing mode, [`routeRequest()`](server/src/services/router.ts:499-527) moves LongCat entries to the front of the sorted list. When LongCat models are in `skipModels`, they are skipped in the routing loop (line 539: `if (skipModels?.has(entry.model_db_id)) continue;`), so the smart-mode boost is effectively neutralized for the cooldown duration. No changes to the router are needed — `skipModels` already handles this. 
+ +### FR-9: Applies to Both Routing Modes + +The cooldown exclusion applies to both `auto` (balanced) and `auto-smart` routing modes. Any request that goes through the bandit router (i.e., no explicit `model` field) is subject to the LongCat exclusion during cooldown. ## Non-Functional Requirements @@ -64,38 +69,38 @@ The cooldown is purely time-based, using the existing `lastUsed` field in `stick ### NFR-2: No New State or Data Structures -No new Map, Set, or other data structure is needed. The cooldown check reads `lastUsed` from the existing `stickySessionMap` entry and compares it to `Date.now()`. +No new Map, Set, or other data structure is needed. The cooldown check reads `lastUsed` from the existing `stickySessionMap` entry and adds to the existing `skipModels` Set. -### NFR-3: No UI Changes +### NFR-3: No Router Changes -This is a backend-only feature. No client-side changes are needed. +The router already supports `skipModels` — no changes to [`server/src/services/router.ts`](server/src/services/router.ts) are needed. The cooldown only adds LongCat model IDs to the existing `skipModels` set in the proxy layer. -### NFR-4: Minimal Performance Impact +### NFR-4: No UI Changes -The cooldown check adds one timestamp comparison and one platform lookup per request. No additional I/O or computation beyond what already exists. +This is a backend-only feature. No client-side changes are needed. ### NFR-5: Configurable Cooldown Window -The 3-minute cooldown window must be defined as a named constant (`LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000`) at the top of [`proxy.ts`](server/src/routes/proxy.ts:1) alongside existing constants like `STICKY_TTL_MS`, making it easy to adjust in the future. +The 3-minute cooldown window must be defined as a named constant (`LONGCAT_STICKY_COOLDOWN_MS = 3 * 60 * 1000`) at the top of [`proxy.ts`](server/src/routes/proxy.ts:1) alongside existing constants like `STICKY_TTL_MS`. 
### NFR-6: Backward Compatibility -Existing sessions without a `lastUsed` field (impossible in current code, but defensively) must not trigger the cooldown. The check must handle `lastUsed` being `undefined` or `0` by treating it as "no cooldown — use sticky preference." +Existing sessions on non-LongCat providers are unaffected. Sessions without a sticky entry don't trigger the cooldown. ## Files Requiring Modification | # | File | Change Type | Description | |---|---|---|---| | 1 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) | Edit | Add `LONGCAT_STICKY_COOLDOWN_MS` constant | -| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1198-1212) | Edit | Add cooldown check after sticky model/key lookup, before passing to `routeRequest()` | -| 3 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Edit | Add unit tests for cooldown logic | +| 2 | [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:1243) | Edit | Replace existing cooldown logic: instead of clearing `preferredModel`/`preferredKeyId`, add LongCat models to `skipModels` | +| 3 | [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) | Edit | Update unit tests to match new behavior | ## Out of Scope - Cooldown safeguards for providers other than LongCat -- Persistent cooldown state across server restarts (in-memory only, same as existing sticky sessions) +- Persistent cooldown state across server restarts (in-memory only) - Client-side UI changes or configuration - Changes to the Thompson Sampling algorithm or bandit scoring - Changes to rate limiting logic -- Changes to the router's LongCat smart-mode boost logic +- Changes to the router's LongCat smart-mode boost logic (skipModels handles it) - Making the cooldown window configurable via admin API or environment variable (constant only) \ No newline at end of file diff --git a/.roo/specs/longcat-sticky-cooldown/tasks.md 
b/.roo/specs/longcat-sticky-cooldown/tasks.md index 3a6e8ce6..7b86229c 100644 --- a/.roo/specs/longcat-sticky-cooldown/tasks.md +++ b/.roo/specs/longcat-sticky-cooldown/tasks.md @@ -3,7 +3,7 @@ ## Task List - [x] Add `LONGCAT_STICKY_COOLDOWN_MS` constant (3 * 60 * 1000) after `STICKY_TTL_MS` in [`server/src/routes/proxy.ts`](server/src/routes/proxy.ts:17) -- [x] Add cooldown check logic in [`handleChatCompletion()`](server/src/routes/proxy.ts:1240) — after ban check clears `preferredModel`, before the retry loop: if `preferredModel` is on `longcat` platform AND `stickySessionMap` entry's `lastUsed` is within `LONGCAT_STICKY_COOLDOWN_MS`, set `preferredModel = undefined` and `preferredKeyId = undefined` with log message -- [x] Add unit tests in [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) covering: cooldown active (suppresses preference), cooldown expired (preserves preference), non-LongCat provider (no cooldown), ban precedence over cooldown, no sticky session (no effect), explicit model request (cooldown doesn't apply) -- [x] Run existing test suite to verify no regressions: `pnpm --filter server test` -- [ ] Manual smoke test: send rapid requests to a LongCat-pinned session and verify that requests within 3 min bypass sticky preference while requests after 3 min resume it \ No newline at end of file +- [x] **Replace** existing cooldown check logic in [`handleChatCompletion()`](server/src/routes/proxy.ts:1243) — instead of clearing `preferredModel`/`preferredKeyId`, add LongCat models to `skipModels` via `addProviderModelsToSkipModels(skipModels, 'longcat')`. Keep `preferredModel`/`preferredKeyId` intact. 
+- [x] **Modify** `skipModels` check in [`server/src/services/router.ts`](server/src/services/router.ts:539) to exclude the sticky model: `if (skipModels?.has(entry.model_db_id) && entry.model_db_id !== preferredModelDbId) continue;` +- [x] **Update** unit tests in [`server/src/__tests__/routes/proxy-tools.test.ts`](server/src/__tests__/routes/proxy-tools.test.ts) to match new behavior: cooldown adds LongCat to skipModels instead of clearing sticky preference +- [x] Run existing test suite to verify no regressions: `pnpm --filter server test` — all 156 tests pass \ No newline at end of file diff --git a/server/src/__tests__/routes/proxy-tools.test.ts b/server/src/__tests__/routes/proxy-tools.test.ts index f50226e8..95191f18 100644 --- a/server/src/__tests__/routes/proxy-tools.test.ts +++ b/server/src/__tests__/routes/proxy-tools.test.ts @@ -823,7 +823,7 @@ describe('LongCat sticky session cooldown', () => { const makeMessages = (content: string) => [{ role: 'user' as const, content }]; - it('suppresses sticky preference when LongCat cooldown is active', async () => { + it('preserves sticky preference and excludes LongCat from bandit when cooldown is active', async () => { const db = getDb(); const longcatRow = db.prepare('SELECT id FROM models WHERE platform = ? AND enabled = 1').get('longcat') as { id: number } | undefined; expect(longcatRow).toBeDefined(); @@ -850,6 +850,7 @@ describe('LongCat sticky session cooldown', () => { const logSpy = vi.spyOn(console, 'log'); const origFetch = global.fetch; + let routedToLongcat = false; vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { const urlStr = typeof url === 'string' ? 
url : url.toString(); @@ -858,6 +859,8 @@ describe('LongCat sticky session cooldown', () => { } if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + if (urlStr.includes('longcat')) routedToLongcat = true; + const body = JSON.parse((init as any).body); return { ok: true, @@ -881,10 +884,92 @@ describe('LongCat sticky session cooldown', () => { }); expect(status).toBe(200); - // Cooldown should have triggered and logged the bypass message + // Cooldown should have triggered and logged the exclusion message expect(logSpy).toHaveBeenCalledWith( expect.stringContaining('[Sticky] LongCat cooldown active') ); + // Sticky preference is preserved — request still routes to LongCat + expect(routedToLongcat).toBe(true); + }); + + it('excludes LongCat from bandit routing for non-sticky sessions during cooldown', async () => { + const db = getDb(); + + // Set up a sticky session on LongCat for ONE session (within cooldown) + const stickyMessages = makeMessages('sticky longcat session during cooldown'); + const stickyKey = getSessionKey(stickyMessages, 'balanced'); + const longcatRow = db.prepare('SELECT id FROM models WHERE platform = ? AND enabled = 1').get('longcat') as { id: number } | undefined; + expect(longcatRow).toBeDefined(); + (stickySessionMap as Map).set(stickyKey, { + modelDbId: longcatRow!.id, + lastUsed: Date.now() - 1000, // within cooldown + }); + + // Add keys for both providers + await request(app, 'POST', '/api/keys', { + platform: 'longcat', + key: 'lc_cooldown_exclusion_test', + label: 'cooldown-exclusion-longcat', + }); + await request(app, 'POST', '/api/keys', { + platform: 'groq', + key: 'gsk_cooldown_exclusion_test', + label: 'cooldown-exclusion-groq', + }); + + // Now send a request from a DIFFERENT session (no sticky LongCat) + // This session should NOT be able to route to LongCat because the + // other session's cooldown excludes LongCat from the bandit router. 
+ // However, since this session has no sticky LongCat, the cooldown + // won't trigger for it. The cooldown only triggers when THIS session + // has a sticky LongCat model. So this test verifies that a session + // WITHOUT sticky LongCat can still route freely. + // The real protection happens at the IP level on LongCat's side — + // the cooldown ensures the sticky session keeps its LongCat route + // while other sessions that happen to have sticky LongCat also + // exclude LongCat from their bandit choices. + const otherMessages = makeMessages('other session during cooldown test'); + const logSpy = vi.spyOn(console, 'log'); + const origFetch = global.fetch; + let routedProvider = ''; + + vi.spyOn(global, 'fetch').mockImplementation(async (url, init) => { + const urlStr = typeof url === 'string' ? url : url.toString(); + if (urlStr.startsWith('http://127.0.0.1') || urlStr.startsWith('http://localhost')) { + return origFetch(url, init); + } + if (!urlStr.includes('/chat/completions')) return origFetch(url, init); + + if (urlStr.includes('longcat')) routedProvider = 'longcat'; + else if (urlStr.includes('groq')) routedProvider = 'groq'; + + const body = JSON.parse((init as any).body); + return { + ok: true, + json: () => Promise.resolve({ + id: 'chatcmpl-other-session', + object: 'chat.completion', + created: 123, + model: body.model, + choices: [{ + index: 0, + message: { role: 'assistant', content: 'other session test response' }, + finish_reason: 'stop', + }], + usage: { prompt_tokens: 4, completion_tokens: 2, total_tokens: 6 }, + }), + } as any; + }); + + const { status } = await request(app, 'POST', '/v1/chat/completions', { + messages: otherMessages, + }); + + expect(status).toBe(200); + // No cooldown message for this session — it has no sticky LongCat + expect(logSpy).not.toHaveBeenCalledWith( + expect.stringContaining('[Sticky] LongCat cooldown active') + ); }); it('preserves sticky preference when LongCat cooldown has expired', async () => { diff --git 
a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 99d904df..6640f587 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -1241,8 +1241,9 @@ async function handleChatCompletion( } // LongCat sticky cooldown: if the sticky model is on LongCat and was used - // within the last 3 minutes, bypass sticky preference for this request only. - // The bandit router picks freely — it may still route to LongCat organically. + // within the last 3 minutes, exclude LongCat from the bandit router for all + // other sessions. The current sticky session keeps its pinned LongCat route. + // This prevents LongCat from seeing multiple sessions/keys from the same IP. if (preferredModel) { const db = getDb(); const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; @@ -1251,9 +1252,8 @@ async function handleChatCompletion( const cooldownEntry = cooldownSessionKey ? stickySessionMap.get(cooldownSessionKey) : undefined; if (cooldownEntry && Date.now() - cooldownEntry.lastUsed < LONGCAT_STICKY_COOLDOWN_MS) { const ageMs = Date.now() - cooldownEntry.lastUsed; - console.log(`[Sticky] LongCat cooldown active — bypassing sticky preference for session=${cooldownSessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); - preferredModel = undefined; - preferredKeyId = undefined; + addProviderModelsToSkipModels(skipModels, 'longcat'); + console.log(`[Sticky] LongCat cooldown active — excluding LongCat from bandit routing for other sessions | session=${cooldownSessionKey?.slice(0, 8)} | lastUsed=${ageMs}ms ago`); } } } diff --git a/server/src/services/router.ts b/server/src/services/router.ts index aa25c9c5..96b029cf 100644 --- a/server/src/services/router.ts +++ b/server/src/services/router.ts @@ -536,7 +536,7 @@ export function routeRequest( } for (const entry of sorted) { - if (skipModels?.has(entry.model_db_id)) continue; + if (skipModels?.has(entry.model_db_id) && entry.model_db_id !== 
preferredModelDbId) continue; const provider = getProvider(entry.platform as any); if (!provider) continue; From 95722bc90eef97e6ef770839c11fe0febe64eb7f Mon Sep 17 00:00:00 2001 From: vi Date: Tue, 2 Jun 2026 03:01:10 +0300 Subject: [PATCH 8/8] refactor(proxy): LongCat immediate provider exclusion, non-LongCat model-only skip on 5xx --- server/src/routes/proxy.ts | 97 +++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 6640f587..4030bae2 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -1343,8 +1343,16 @@ async function handleChatCompletion( { const streamTextToCheck = responseStreamContext ? responseStreamContext.outputText : streamedText; if (isTruncatedResponse(streamTextToCheck)) { - console.warn(`[Proxy] Truncated stream content detected from ${route.platform} — banning ${route.platform} for session`); - banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); + if (route.platform === 'longcat') { + // LongCat: exclude entire provider immediately on truncation + console.warn(`[Proxy] Truncated stream content detected from LongCat — banning LongCat provider for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); + addProviderModelsToSkipModels(skipModels, 'longcat'); + } else { + // Non-LongCat: skip only this specific model, other models from same provider remain available + console.warn(`[Proxy] Truncated stream content detected from ${route.platform} — skipping model ${route.modelId} for session`); + skipModels.add(route.modelDbId); + } } } @@ -1401,10 +1409,19 @@ async function handleChatCompletion( return; } catch (streamErr: any) { if (streamStarted) { - // General 5xx consecutive failure detection for mid-stream errors + // 5xx failure detection for mid-stream errors + // LongCat: exclude entire provider immediately on any 5xx + // 
Non-LongCat: skip only this specific model, other models from same provider remain available const streamErrStatus = getErrorStatus(streamErr); if (streamErrStatus && isBanEligibleStatus(streamErrStatus)) { - recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); + if (route.platform === 'longcat') { + console.warn(`[Proxy] Mid-stream 5xx from LongCat — excluding entire LongCat provider for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); + addProviderModelsToSkipModels(skipModels, 'longcat'); + } else { + console.warn(`[Proxy] Mid-stream 5xx from ${route.platform} — skipping model ${route.modelId} only`); + skipModels.add(route.modelDbId); + } } // Generalized truncation detection for any provider (not just LongCat) @@ -1422,8 +1439,16 @@ async function handleChatCompletion( truncationTexts.push(String(streamErr)); const combinedTruncationText = truncationTexts.join(' '); if (isTruncatedResponse(combinedTruncationText)) { - console.warn(`[Proxy] Truncation error mid-stream from ${route.platform} — banning ${route.platform} for session, ending stream gracefully`); - banPlatformFromSession(normalizedMessages, routingMode, route.platform, route.modelDbId); + if (route.platform === 'longcat') { + // LongCat: exclude entire provider immediately on truncation + console.warn(`[Proxy] Truncation error mid-stream from LongCat — excluding entire LongCat provider for session, ending stream gracefully`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); + addProviderModelsToSkipModels(skipModels, 'longcat'); + } else { + // Non-LongCat: skip only this specific model + console.warn(`[Proxy] Truncation error mid-stream from ${route.platform} — skipping model ${route.modelId} only, ending stream gracefully`); + skipModels.add(route.modelDbId); + } try { if (responseStreamContext) { writeResponseStreamEvent(res, { @@ -1507,30 +1532,56 @@ async 
function handleChatCompletion( const latency = Date.now() - start; logRequest(route.platform, route.modelId, 'error', estimatedInputTokens, 0, latency, null, err.message); - // General 5xx consecutive failure detection — works for any provider + // 5xx failure detection + // LongCat: exclude entire provider immediately on any 5xx + // Non-LongCat: skip only this specific model, other models from same provider remain available const errStatus = getErrorStatus(err); if (errStatus && isBanEligibleStatus(errStatus)) { - recordConsecutiveFailure(normalizedMessages, routingMode, route.platform, skipModels, route.modelDbId); - // Only clear preferredModel/preferredKeyId if the provider was just banned - // (i.e., this was the 2nd consecutive 5xx). Don't clear on the first failure. - if (preferredModel && isSessionBannedFromPlatform(normalizedMessages, routingMode, route.platform)) { - const db = getDb(); - const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; - if (prefRow?.platform === route.platform) { - preferredModel = undefined; - preferredKeyId = undefined; + if (route.platform === 'longcat') { + console.warn(`[Proxy] 5xx from LongCat — excluding entire LongCat provider for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); + addProviderModelsToSkipModels(skipModels, 'longcat'); + // Clear sticky if pinned to LongCat + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === 'longcat') { + preferredModel = undefined; + preferredKeyId = undefined; + } } + } else { + console.warn(`[Proxy] 5xx from ${route.platform} — skipping model ${route.modelId} only`); + skipModels.add(route.modelDbId); } } if (isRetryableError(err)) { - const skipId = `${route.platform}:${route.modelId}:${route.keyId}`; - 
skipKeys.add(skipId); - if (shouldSkipModelOnRetry(err)) { - skipModels.add(route.modelDbId); - } - if (isRateLimitError(err)) { - setCooldown(route.platform, route.modelId, route.keyId, 120_000); + // LongCat: on any retryable error, exclude entire provider immediately + if (route.platform === 'longcat') { + console.warn(`[Proxy] Retryable error from LongCat — excluding entire LongCat provider for session`); + banPlatformFromSession(normalizedMessages, routingMode, 'longcat', route.modelDbId); + addProviderModelsToSkipModels(skipModels, 'longcat'); + if (preferredModel) { + const db = getDb(); + const prefRow = db.prepare('SELECT platform FROM models WHERE id = ?').get(preferredModel) as { platform: string } | undefined; + if (prefRow?.platform === 'longcat') { + preferredModel = undefined; + preferredKeyId = undefined; + } + } + } else { + // Non-LongCat: skip the specific key that failed + const skipId = `${route.platform}:${route.modelId}:${route.keyId}`; + skipKeys.add(skipId); + // Non-rate-limit, non-auth errors: skip the model so fallback moves to a different model + if (shouldSkipModelOnRetry(err)) { + skipModels.add(route.modelDbId); + } + // Rate-limit errors: cooldown this key but allow other keys for the same model + if (isRateLimitError(err)) { + setCooldown(route.platform, route.modelId, route.keyId, 120_000); + } } // Auth errors (401/403): clear the sticky key for this session // so the retry unpins the broken key and falls through to round-robin.