diff --git a/core/core.ts b/core/core.ts index e0a222e7854..3a1877f66cb 100644 --- a/core/core.ts +++ b/core/core.ts @@ -15,7 +15,7 @@ import { DevDataSqliteDb } from "./data/devdataSqlite"; import { DataLogger } from "./data/log"; import { CodebaseIndexer } from "./indexing/CodebaseIndexer"; import DocsService from "./indexing/docs/DocsService"; -import { countTokens } from "./llm/countTokens"; +import { countTokens, getAvailableInputTokens } from "./llm/countTokens"; import Lemonade from "./llm/llms/Lemonade"; import { fetchModels } from "./llm/fetchModels"; import Ollama from "./llm/llms/Ollama"; @@ -1296,12 +1296,12 @@ export class Core { } const tokens = countTokens(item.content, llm.model); + const availableTokens = getAvailableInputTokens( + llm.contextLength, + llm.completionOptions!.maxTokens!, + ); - if (tokens > llm.contextLength - llm.completionOptions!.maxTokens!) { - return true; - } - - return false; + return tokens > availableTokens; } private handleAddAutocompleteModel( diff --git a/core/llm/countTokens.test.ts b/core/llm/countTokens.test.ts index fa4592cdc66..e3dbf42b646 100644 --- a/core/llm/countTokens.test.ts +++ b/core/llm/countTokens.test.ts @@ -7,6 +7,7 @@ import { countTokens, countTokensAsync, extractToolSequence, + getAvailableInputTokens, pruneLinesFromBottom, pruneLinesFromTop, pruneRawPromptFromTop, @@ -28,6 +29,34 @@ describe.skip("countTokens", () => { }); }); +describe("getAvailableInputTokens", () => { + it("reserves only a minimum response allowance, not the full maxTokens", () => { + // Small context window with the default 4096 maxTokens (e.g. a local model + // with no known completion limit). The old guard reserved the full 4096, + // leaving only contextLength - 4096; this reserves MIN_RESPONSE_TOKENS (1000) + // plus the safety buffer instead. + const contextLength = 8192; + const maxTokens = 4096; + + const available = getAvailableInputTokens(contextLength, maxTokens); + + // Far more headroom than the old `contextLength - maxTokens` formula. + expect(available).toBeGreaterThan(contextLength - maxTokens); + // Safety buffer is min(1000, 8192 * 0.02 = 163.84) = 163.84, response = 1000. + expect(available).toBeCloseTo(8192 - 163.84 - 1000, 2); + }); + + it("reserves maxTokens when it is below the minimum response allowance", () => { + const contextLength = 100_000; + const maxTokens = 256; + + const available = getAvailableInputTokens(contextLength, maxTokens); + + // Safety buffer caps at 1000; response reservation is the smaller maxTokens. + expect(available).toBe(100_000 - 1000 - 256); + }); +}); + describe("countTokensAsync", () => { afterAll(async () => { // Clean up the global async encoders to prevent Jest from hanging diff --git a/core/llm/countTokens.ts b/core/llm/countTokens.ts index b742d70b0f0..e4c6756b203 100644 --- a/core/llm/countTokens.ts +++ b/core/llm/countTokens.ts @@ -376,6 +376,26 @@ export function getTokenCountingBufferSafety(contextLength: number) { const MIN_RESPONSE_TOKENS = 1000; +/** + * Tokens available for input content in a single request. This mirrors how + * compileChatMessages budgets a request: reserve the counting safety buffer + * plus a minimum response allowance, rather than the full configured + * completion budget (`maxTokens`). Reserving the full `maxTokens` here makes a + * single context item appear too big on small context windows or on models + * with no known completion limit (where `maxTokens` defaults to 4096), even + * though the message compiler would happily include it. + */ +export function getAvailableInputTokens( + contextLength: number, + maxTokens: number, +): number { + return ( + contextLength - + getTokenCountingBufferSafety(contextLength) - + Math.min(MIN_RESPONSE_TOKENS, maxTokens) + ); +} + function pruneRawPromptFromTop( modelName: string, contextLength: number,