From 05350dc764f0ace7d50c2c93fc89b99a55a88487 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Thu, 7 May 2026 10:11:37 +0100 Subject: [PATCH] fix: migrate evaluator to ai sdk v6 output schema Use generateText with Output.object for structured evaluator responses after upgrading the ai package to v6. Co-authored-by: Codex --- .changeset/bright-llamas-pay.md | 5 ++ docs/EVALUATOR.md | 12 ++-- package.json | 2 +- pnpm-lock.yaml | 69 ++++++++++++---------- src/evaluators/evaluator-telemetry.spec.ts | 19 +++--- src/evaluators/evaluator.spec.ts | 51 ++++++++-------- src/evaluators/evaluator.ts | 18 +++--- 7 files changed, 95 insertions(+), 81 deletions(-) create mode 100644 .changeset/bright-llamas-pay.md diff --git a/.changeset/bright-llamas-pay.md b/.changeset/bright-llamas-pay.md new file mode 100644 index 0000000..e6929a3 --- /dev/null +++ b/.changeset/bright-llamas-pay.md @@ -0,0 +1,5 @@ +--- +"@loveholidays/eval-kit": patch +--- + +Migrate evaluator structured output generation to AI SDK v6. diff --git a/docs/EVALUATOR.md b/docs/EVALUATOR.md index 506acec..fc5313b 100644 --- a/docs/EVALUATOR.md +++ b/docs/EVALUATOR.md @@ -10,13 +10,13 @@ The Evaluator enables LLM-powered content evaluation with flexible prompt templa 1. **Evaluator** - Main evaluator class that orchestrates evaluation 2. **TemplateRenderer** - Handlebars-style template engine for prompts -3. **Vercel AI SDK** - Handles LLM API calls with structured output via generateObject +3. **Vercel AI SDK** - Handles LLM API calls with structured output via generateText 4. **Zod Schemas** - Dynamic schema generation based on score configuration ### Data Flow ``` -User Input → Template Rendering → Vercel AI SDK generateObject → Structured Result with Stats +User Input → Template Rendering → Vercel AI SDK generateText → Structured Result with Stats ``` ## Template Engine @@ -64,7 +64,7 @@ This enables automatic detection of required inputs based on the template. ## Structured Output with Vercel AI SDK -The evaluator uses Vercel AI SDK's `generateObject` function to ensure structured, validated responses from the LLM. +The evaluator uses Vercel AI SDK's `generateText` function with an output schema to ensure structured, validated responses from the LLM. ### How It Works @@ -192,7 +192,7 @@ Any provider compatible with Vercel AI SDK, including: ### Model Settings -Optional settings passed to `generateObject`: +Optional settings passed to `generateText`: ```typescript { @@ -394,7 +394,7 @@ The evaluator dynamically creates Zod schemas based on scoreConfig: - All score configurations (numeric, categorical, default) - Error conditions (API failures, undefined usage) - Processing stats tracking (execution time, token usage) -- Model settings passthrough to generateObject +- Model settings passthrough to generateText ## Performance Considerations @@ -408,6 +408,6 @@ Templates are rendered on every evaluation. For high-frequency evaluations, cons - Caching rendered templates if variables don't change - Using simpler templates without conditionals -### Vercel AI SDK generateObject +### Vercel AI SDK generateText The Vercel AI SDK handles response parsing efficiently with structured output. The Zod schema validation ensures type-safe responses without manual parsing overhead. diff --git a/package.json b/package.json index 2b888fc..e9be714 100644 --- a/package.json +++ b/package.json @@ -87,7 +87,7 @@ }, "dependencies": { "@xenova/transformers": "^2.17.2", - "ai": "^5.0.52", + "ai": "^6.0.175", "csv-parse": "^6.1.0", "csv-stringify": "^6.6.0", "fastest-levenshtein": "^1.0.16", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 92ad58a..a15782a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16,8 +16,8 @@ importers: specifier: ^2.17.2 version: 2.17.2 ai: - specifier: ^5.0.52 - version: 5.0.52(zod@3.25.76) + specifier: ^6.0.175 + version: 6.0.175(zod@3.25.76) csv-parse: specifier: ^6.1.0 version: 6.1.0 @@ -70,20 +70,20 @@ importers: packages: - '@ai-sdk/gateway@1.0.29': - resolution: {integrity: sha512-o9LtmBiG2WAgs3GAmL79F8idan/UupxHG8Tyr2gP4aUSOzflM0bsvfzozBp8x6WatQnOx+Pio7YNw45Y6I16iw==} + '@ai-sdk/gateway@3.0.110': + resolution: {integrity: sha512-sbv8+1L9/BRKydn8dMNwoMQKupA4iLJ9N+yvxgW6wMQ/94UepDf3FeYWMj/dLdzolAHZ6izRUP4s5WqQkmJ2Zg==} engines: {node: '>=18'} peerDependencies: - zod: ^3.25.76 || ^4 + zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/provider-utils@3.0.9': - resolution: {integrity: sha512-Pm571x5efqaI4hf9yW4KsVlDBDme8++UepZRnq+kqVBWWjgvGhQlzU8glaFq0YJEB9kkxZHbRRyVeHoV2sRYaQ==} + '@ai-sdk/provider-utils@4.0.26': + resolution: {integrity: sha512-CsKNLKsOpvPujRlIYvoz+Ybw+kGn7J4/fIZa/58+R7iWLLfwn6ifE2G6Yq8K9XvH/I/3bzaDAJ3NhRwEMsLBKQ==} engines: {node: '>=18'} peerDependencies: - zod: ^3.25.76 || ^4 + zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/provider@2.0.0': - resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==} + '@ai-sdk/provider@3.0.10': + resolution: {integrity: sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw==} engines: {node: '>=18'} '@babel/code-frame@7.27.1': @@ -939,8 +939,8 @@ packages: '@sinonjs/fake-timers@13.0.5': resolution: {integrity: sha512-36/hTbH2uaWuGVERyC6da9YwGWnzUZXuPro/F2LfsdOsLnCojz/iSH8MxUt/FD2S5XBSVPhmArFUXcpCQ2Hkiw==} - '@standard-schema/spec@1.0.0': - resolution: {integrity: sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==} + '@standard-schema/spec@1.1.0': + resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} '@tybys/wasm-util@0.10.1': resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==} @@ -1091,6 +1091,10 @@ packages: cpu: [x64] os: [win32] + '@vercel/oidc@3.2.0': + resolution: {integrity: sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug==} + engines: {node: '>= 20'} + '@volar/language-core@2.4.26': resolution: {integrity: sha512-hH0SMitMxnB43OZpyF1IFPS9bgb2I3bpCh76m2WEK7BE0A0EzpYsRp0CCH2xNKshr7kacU5TQBLYn4zj7CG60A==} @@ -1128,11 +1132,11 @@ packages: engines: {node: '>=0.4.0'} hasBin: true - ai@5.0.52: - resolution: {integrity: sha512-GLlRHjMlvN9+w7UYGxCpUQ8GgCRv5Z+JCprRH3Q8YbXJ/JyIc6EP9+YRUmQsyExX/qQsuehe7y/LLygarbSTOw==} + ai@6.0.175: + resolution: {integrity: sha512-6fFFHzbh6FIZnYc31V6osOxq25ABJYCShfG0O6ajHiA4FB/DgnPi1mP8cO5aAU3HNSbQHiMazdlh9bIsp97mVA==} engines: {node: '>=18'} peerDependencies: - zod: ^3.25.76 || ^4 + zod: ^3.25.76 || ^4.1.8 ajv-draft-04@1.0.0: resolution: {integrity: sha512-mv00Te6nmYbRp5DCwclxtt7yV/joXJPGS7nM+97GdxvuttCOfgI3K4U25zboyeX0O+myI8ERluxQe5wljMmVIw==} @@ -1513,8 +1517,8 @@ packages: events-universal@1.0.1: resolution: {integrity: sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==} - eventsource-parser@3.0.6: - resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==} + eventsource-parser@3.0.8: + resolution: {integrity: sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==} engines: {node: '>=18.0.0'} execa@5.1.1: @@ -2672,20 +2676,21 @@ packages: snapshots: - '@ai-sdk/gateway@1.0.29(zod@3.25.76)': + '@ai-sdk/gateway@3.0.110(zod@3.25.76)': dependencies: - '@ai-sdk/provider': 2.0.0 - '@ai-sdk/provider-utils': 3.0.9(zod@3.25.76) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.26(zod@3.25.76) + '@vercel/oidc': 3.2.0 zod: 3.25.76 - '@ai-sdk/provider-utils@3.0.9(zod@3.25.76)': + '@ai-sdk/provider-utils@4.0.26(zod@3.25.76)': dependencies: - '@ai-sdk/provider': 2.0.0 - '@standard-schema/spec': 1.0.0 - eventsource-parser: 3.0.6 + '@ai-sdk/provider': 3.0.10 + '@standard-schema/spec': 1.1.0 + eventsource-parser: 3.0.8 zod: 3.25.76 - '@ai-sdk/provider@2.0.0': + '@ai-sdk/provider@3.0.10': dependencies: json-schema: 0.4.0 @@ -3653,7 +3658,7 @@ snapshots: dependencies: '@sinonjs/commons': 3.0.1 - '@standard-schema/spec@1.0.0': {} + '@standard-schema/spec@1.1.0': {} '@tybys/wasm-util@0.10.1': dependencies: @@ -3777,6 +3782,8 @@ snapshots: '@unrs/resolver-binding-win32-x64-msvc@1.11.1': optional: true + '@vercel/oidc@3.2.0': {} + '@volar/language-core@2.4.26': dependencies: '@volar/source-map': 2.4.26 @@ -3836,11 +3843,11 @@ snapshots: acorn@8.15.0: {} - ai@5.0.52(zod@3.25.76): + ai@6.0.175(zod@3.25.76): dependencies: - '@ai-sdk/gateway': 1.0.29(zod@3.25.76) - '@ai-sdk/provider': 2.0.0 - '@ai-sdk/provider-utils': 3.0.9(zod@3.25.76) + '@ai-sdk/gateway': 3.0.110(zod@3.25.76) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.26(zod@3.25.76) '@opentelemetry/api': 1.9.0 zod: 3.25.76 @@ -4209,7 +4216,7 @@ snapshots: transitivePeerDependencies: - bare-abort-controller - eventsource-parser@3.0.6: {} + eventsource-parser@3.0.8: {} execa@5.1.1: dependencies: diff --git a/src/evaluators/evaluator-telemetry.spec.ts b/src/evaluators/evaluator-telemetry.spec.ts index 11396f1..bcffded 100644 --- a/src/evaluators/evaluator-telemetry.spec.ts +++ b/src/evaluators/evaluator-telemetry.spec.ts @@ -14,9 +14,12 @@ provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); provider.register(); // Mock the ai module -const mockGenerateObject = jest.fn(); +const mockGenerateText = jest.fn(); jest.unstable_mockModule("ai", () => ({ - generateObject: mockGenerateObject, + generateText: mockGenerateText, + Output: { + object: jest.fn((output) => ({ type: "object", ...output })), + }, })); const { Evaluator } = await import("./evaluator.js"); @@ -34,12 +37,12 @@ describe("Evaluator telemetry", () => { exporter.reset(); _resetTracer(); enableTelemetry(true); - mockGenerateObject.mockClear(); + mockGenerateText.mockClear(); }); it("should create a span with correct name and initial attributes on success", async () => { - mockGenerateObject.mockResolvedValue({ - object: { score: 85, feedback: "Good quality" }, + mockGenerateText.mockResolvedValue({ + output: { score: 85, feedback: "Good quality" }, usage: { inputTokens: 100, outputTokens: 20, totalTokens: 120 }, }); @@ -79,7 +82,7 @@ describe("Evaluator telemetry", () => { }); it("should record error attributes when evaluation fails", async () => { - mockGenerateObject.mockRejectedValue(new Error("API rate limited")); + mockGenerateText.mockRejectedValue(new Error("API rate limited")); const evaluator = new Evaluator({ name: "accuracy", @@ -112,8 +115,8 @@ describe("Evaluator telemetry", () => { }); it("should not break existing behavior when OTel is present", async () => { - mockGenerateObject.mockResolvedValue({ - object: { score: "excellent", feedback: "Top quality" }, + mockGenerateText.mockResolvedValue({ + output: { score: "excellent", feedback: "Top quality" }, usage: undefined, }); diff --git a/src/evaluators/evaluator.spec.ts b/src/evaluators/evaluator.spec.ts index 7ac782b..0f5b562 100644 --- a/src/evaluators/evaluator.spec.ts +++ b/src/evaluators/evaluator.spec.ts @@ -2,9 +2,12 @@ import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import type { LanguageModel } from "ai"; // Mock the ai module before importing Evaluator -const mockGenerateObject = jest.fn(); +const mockGenerateText = jest.fn(); jest.unstable_mockModule("ai", () => ({ - generateObject: mockGenerateObject, + generateText: mockGenerateText, + Output: { + object: jest.fn((output) => ({ type: "object", ...output })), + }, })); // Import after mocking @@ -22,7 +25,7 @@ const createMockModel = (): LanguageModel => { describe("Evaluator", () => { beforeEach(() => { - mockGenerateObject.mockClear(); + mockGenerateText.mockClear(); }); describe("constructor", () => { @@ -56,10 +59,10 @@ describe("Evaluator", () => { it("should evaluate with basic input", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: 90, feedback: "Excellent fluency" }, + mockGenerateText.mockResolvedValue({ + output: { score: 90, feedback: "Excellent fluency" }, usage: { inputTokens: 15, outputTokens: 10, totalTokens: 25 }, - } as any); + }); const evaluator = new Evaluator({ name: "fluency", @@ -85,10 +88,10 @@ describe("Evaluator", () => { it("should evaluate with reference text", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: 88, feedback: "Good accuracy" }, + mockGenerateText.mockResolvedValue({ + output: { score: 88, feedback: "Good accuracy" }, usage: undefined, - } as any); + }); const evaluator = new Evaluator({ name: "accuracy", @@ -109,7 +112,7 @@ Candidate: {{candidateText}} it("should handle API failures gracefully", async () => { const model = createMockModel(); - mockGenerateObject.mockRejectedValue(new Error("LLM API failed")); + mockGenerateText.mockRejectedValue(new Error("LLM API failed")); const evaluator = new Evaluator({ name: "test", @@ -129,10 +132,10 @@ Candidate: {{candidateText}} it("should handle categorical scores", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: "excellent", feedback: "Top quality" }, + mockGenerateText.mockResolvedValue({ + output: { score: "excellent", feedback: "Top quality" }, usage: undefined, - } as any); + }); const evaluator = new Evaluator({ name: "quality", @@ -154,10 +157,10 @@ Candidate: {{candidateText}} it("should handle numeric score config", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: 8, feedback: "Good" }, + mockGenerateText.mockResolvedValue({ + output: { score: 8, feedback: "Good" }, usage: undefined, - } as any); + }); const evaluator = new Evaluator({ name: "rating", @@ -181,10 +184,10 @@ Candidate: {{candidateText}} it("should track execution time", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: 85, feedback: "Good" }, + mockGenerateText.mockResolvedValue({ + output: { score: 85, feedback: "Good" }, usage: undefined, - } as any); + }); const evaluator = new Evaluator({ name: "test", @@ -200,13 +203,13 @@ Candidate: {{candidateText}} expect(typeof result.processingStats.executionTime).toBe("number"); }); - it("should pass model settings to generateObject", async () => { + it("should pass model settings to generateText", async () => { const model = createMockModel(); - mockGenerateObject.mockResolvedValue({ - object: { score: 85, feedback: "Good" }, + mockGenerateText.mockResolvedValue({ + output: { score: 85, feedback: "Good" }, usage: undefined, - } as any); + }); const evaluator = new Evaluator({ name: "test", @@ -223,7 +226,7 @@ Candidate: {{candidateText}} candidateText: "Test", }); - expect(mockGenerateObject).toHaveBeenCalledWith( + expect(mockGenerateText).toHaveBeenCalledWith( expect.objectContaining({ temperature: 0.7, maxOutputTokens: 500, diff --git a/src/evaluators/evaluator.ts b/src/evaluators/evaluator.ts index 571c337..09f0771 100644 --- a/src/evaluators/evaluator.ts +++ b/src/evaluators/evaluator.ts @@ -1,4 +1,4 @@ -import { generateObject } from "ai"; +import { generateText, Output } from "ai"; import { z } from "zod"; import { type EvalKitSpan, @@ -134,9 +134,9 @@ export class Evaluator { const prompt = this.buildPrompt(input); const schema = this.createSchema(); - const result = (await generateObject({ + const result = await generateText({ model: this.model, - schema, + output: Output.object({ schema }), prompt, temperature: this.modelSettings?.temperature, maxOutputTokens: this.modelSettings?.maxOutputTokens, @@ -146,23 +146,19 @@ export class Evaluator { frequencyPenalty: this.modelSettings?.frequencyPenalty, seed: this.modelSettings?.seed, experimental_telemetry: { isEnabled: isTelemetryEnabled() }, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any)) as unknown as { - object: { score: number | string; feedback: string }; - usage: unknown; - }; + }); const executionTime = Date.now() - startTime; const tokenUsage = this.extractTokenUsage(result.usage); this.setTokenAttributes(span, tokenUsage); - span.setAttribute("eval_kit.result.score", result.object.score); + span.setAttribute("eval_kit.result.score", result.output.score); return { evaluatorName: this.name, model: modelId, - score: result.object.score, - feedback: result.object.feedback, + score: result.output.score, + feedback: result.output.feedback, processingStats: { executionTime, tokenUsage }, }; }