diff --git a/.changeset/rich-token-usage.md b/.changeset/rich-token-usage.md
new file mode 100644
index 0000000..20d33be
--- /dev/null
+++ b/.changeset/rich-token-usage.md
@@ -0,0 +1,5 @@
+---
+"@loveholidays/eval-kit": minor
+---
+
+Expose detailed AI SDK token usage on evaluator results.
diff --git a/src/evaluators/evaluator.spec.ts b/src/evaluators/evaluator.spec.ts
index 0f5b562..786689b 100644
--- a/src/evaluators/evaluator.spec.ts
+++ b/src/evaluators/evaluator.spec.ts
@@ -85,6 +85,57 @@ describe("Evaluator", () => {
     });
   });
 
+  it("should preserve detailed token usage", async () => {
+    const model = createMockModel();
+
+    mockGenerateText.mockResolvedValue({
+      output: { score: 90, feedback: "Excellent fluency" },
+      usage: {
+        inputTokens: 150,
+        inputTokenDetails: {
+          noCacheTokens: 100,
+          cacheReadTokens: 40,
+          cacheWriteTokens: 10,
+        },
+        outputTokens: 25,
+        outputTokenDetails: {
+          textTokens: 15,
+          reasoningTokens: 10,
+        },
+        totalTokens: 175,
+        reasoningTokens: 10,
+        cachedInputTokens: 40,
+      },
+    });
+
+    const evaluator = new Evaluator({
+      name: "fluency",
+      model,
+      evaluationPrompt: "Rate the fluency of: {{candidateText}}",
+    });
+
+    const result = await evaluator.evaluate({
+      candidateText: "The quick brown fox jumps over the lazy dog.",
+    });
+
+    expect(result.processingStats.tokenUsage).toEqual({
+      inputTokens: 150,
+      inputTokenDetails: {
+        noCacheTokens: 100,
+        cacheReadTokens: 40,
+        cacheWriteTokens: 10,
+      },
+      outputTokens: 25,
+      outputTokenDetails: {
+        textTokens: 15,
+        reasoningTokens: 10,
+      },
+      totalTokens: 175,
+      reasoningTokens: 10,
+      cachedInputTokens: 40,
+    });
+  });
+
   it("should evaluate with reference text", async () => {
     const model = createMockModel();
 
diff --git a/src/evaluators/evaluator.ts b/src/evaluators/evaluator.ts
index 09f0771..9cd81e7 100644
--- a/src/evaluators/evaluator.ts
+++ b/src/evaluators/evaluator.ts
@@ -10,9 +10,24 @@ import type {
   EvaluationInput,
   EvaluatorConfig,
   EvaluatorResult,
+  TokenUsage,
 } from "../types/evaluator.js";
 import { TemplateRenderer } from "../utils/template-engine.js";
 
+function getNumber(value: unknown): number | undefined {
+  return typeof value === "number" ? value : undefined;
+}
+
+function getRecord(value: unknown): Record<string, unknown> | undefined {
+  return value && typeof value === "object"
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function hasNumberValue(value: Record<string, unknown>): boolean {
+  return Object.values(value).some((entry) => typeof entry === "number");
+}
+
 export class Evaluator {
   readonly name: string;
   readonly timeout?: number;
@@ -171,9 +186,7 @@ export class Evaluator {
 
   private setTokenAttributes(
     span: EvalKitSpan,
-    tokenUsage:
-      | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-      | undefined,
+    tokenUsage: TokenUsage | undefined,
   ): void {
     if (tokenUsage?.inputTokens !== undefined) {
       span.setAttribute(
@@ -207,22 +220,51 @@ export class Evaluator {
     };
   }
 
-  private extractTokenUsage(
-    usage: unknown,
-  ):
-    | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-    | undefined {
+  private extractTokenUsage(usage: unknown): TokenUsage | undefined {
     if (!usage) return undefined;
+    const usageRecord = getRecord(usage);
+    if (!usageRecord) return undefined;
+
+    const inputTokenDetails = this.extractInputTokenDetails(
+      usageRecord.inputTokenDetails,
+    );
+    const outputTokenDetails = this.extractOutputTokenDetails(
+      usageRecord.outputTokenDetails,
+    );
+
+    return {
+      inputTokens: getNumber(usageRecord.inputTokens),
+      ...(inputTokenDetails ? { inputTokenDetails } : {}),
+      outputTokens: getNumber(usageRecord.outputTokens),
+      ...(outputTokenDetails ? { outputTokenDetails } : {}),
+      totalTokens: getNumber(usageRecord.totalTokens),
+      reasoningTokens: getNumber(usageRecord.reasoningTokens),
+      cachedInputTokens: getNumber(usageRecord.cachedInputTokens),
+    };
+  }
+
+  private extractInputTokenDetails(
+    details: unknown,
+  ): TokenUsage["inputTokenDetails"] | undefined {
+    const detailRecord = getRecord(details);
+    if (!detailRecord || !hasNumberValue(detailRecord)) return undefined;
+
+    return {
+      noCacheTokens: getNumber(detailRecord.noCacheTokens),
+      cacheReadTokens: getNumber(detailRecord.cacheReadTokens),
+      cacheWriteTokens: getNumber(detailRecord.cacheWriteTokens),
+    };
+  }
+
+  private extractOutputTokenDetails(
+    details: unknown,
+  ): TokenUsage["outputTokenDetails"] | undefined {
+    const detailRecord = getRecord(details);
+    if (!detailRecord || !hasNumberValue(detailRecord)) return undefined;
+
     return {
-      inputTokens: (usage as Record<string, unknown>).inputTokens as
-        | number
-        | undefined,
-      outputTokens: (usage as Record<string, unknown>).outputTokens as
-        | number
-        | undefined,
-      totalTokens: (usage as Record<string, unknown>).totalTokens as
-        | number
-        | undefined,
+      textTokens: getNumber(detailRecord.textTokens),
+      reasoningTokens: getNumber(detailRecord.reasoningTokens),
     };
   }
 
diff --git a/src/types/evaluator.ts b/src/types/evaluator.ts
index f0d71d9..34370a9 100644
--- a/src/types/evaluator.ts
+++ b/src/types/evaluator.ts
@@ -2,8 +2,19 @@ import type { LanguageModel } from "ai";
 
 export interface TokenUsage {
   readonly inputTokens?: number;
+  readonly inputTokenDetails?: {
+    readonly noCacheTokens?: number;
+    readonly cacheReadTokens?: number;
+    readonly cacheWriteTokens?: number;
+  };
   readonly outputTokens?: number;
+  readonly outputTokenDetails?: {
+    readonly textTokens?: number;
+    readonly reasoningTokens?: number;
+  };
   readonly totalTokens?: number;
+  readonly reasoningTokens?: number;
+  readonly cachedInputTokens?: number;
 }
 
 export interface ProcessingStats {