diff --git a/README.md b/README.md index 5eee599..66b7ac2 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,15 @@ test: country: France ``` +### Output override + +```yml +prompts: + - What is the capital of France? +tests: + - output: Paris # Used in JQA tests: https://eva-llm.github.io/eva-run/#jqa-judge-quality-audit-metrology-mode +``` + ### Asserts **NOTE!** All LLM asserts support natively [Dark Teaming](https://eva-llm.github.io/dark-teaming) to measure Epistemic Honesty via Symmetry Deviation, and extend Promptfoo format with field `must_fail` diff --git a/docs/src/pages/index.md b/docs/src/pages/index.md index 98aff9c..03dfeaa 100644 --- a/docs/src/pages/index.md +++ b/docs/src/pages/index.md @@ -63,6 +63,15 @@ test: country: France ``` +### Output override + +```yml +prompts: + - What is the capital of France? +tests: + - output: Paris # Used in JQA tests: https://eva-llm.github.io/eva-run/#jqa-judge-quality-audit-metrology-mode +``` + ### Asserts **NOTE!** All LLM asserts support natively [Dark Teaming](https://eva-llm.github.io/dark-teaming) to measure Epistemic Honesty via Symmetry Deviation, and extend Promptfoo format with field `must_fail` diff --git a/package.json b/package.json index 6674d33..5f4974e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@eva-llm/eva-parser", - "version": "1.0.3", + "version": "1.0.4", "description": "A converter for Promptfoo test formats and into the EVA-LLM ecosystem", "main": "dst/index.js", "types": "dst/index.d.ts", diff --git a/src/index.ts b/src/index.ts index 6f909cf..57cd6d6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,12 +4,19 @@ import { parse } from 'yaml'; import { ASSERT_NAMES, type TAssert, + type TEvaTest, + type TEvaTestWithPrompt, type TProviderObj, + type TTest, type TVercelOptions, } from './types'; -export * from './types'; +export { + ASSERT_NAMES, + type TTest, + type TAssert, +} from './types'; const parseProvider = (providerObj: string | TProviderObj) => { let 
options: TVercelOptions = {}; @@ -86,7 +93,7 @@ const parseAssert = (fooAssert: any): Omit => { export function parsePromptfoo(yamlContent: string) { const promptfoo = parse(yamlContent); - const evaTests = []; + const evaTests: TEvaTest[] = []; for (const fooTest of promptfoo.tests || []) { for (let i = 0; i < (fooTest.times || 1); i++) { @@ -95,11 +102,15 @@ export function parsePromptfoo(yamlContent: string) { continue; } - const evaTest = { + const evaTest: TEvaTest = { vars: fooTest.vars, asserts: [] as TAssert[], }; + if (fooTest.output !== undefined) { + evaTest.output = fooTest.output; + }; + for (const fooAssert of fooTest.assert) { if (!Object.values(ASSERT_NAMES).includes(fooAssert.type)) { continue; @@ -130,18 +141,24 @@ export function parsePromptfoo(yamlContent: string) { return []; } - const evaTestsWithPrompts = []; + const evaTestsWithPrompts: TEvaTestWithPrompt[] = []; for (const fooPrompt of promptfoo.prompts || []) { for (const evaTest of evaTests) { - evaTestsWithPrompts.push({ + const evaTestWithPrompt: TEvaTestWithPrompt = { prompt: injectVars(fooPrompt, evaTest.vars), asserts: evaTest.asserts, - }); + }; + + if (evaTest.output !== undefined) { + evaTestWithPrompt.output = evaTest.output; + } + + evaTestsWithPrompts.push(evaTestWithPrompt); } } - const finalTests = []; + const evaRunTasks: TTest[] = []; for (const providerObj of promptfoo.providers || []) { const parsedProvider = parseProvider(providerObj); @@ -156,12 +173,20 @@ export function parsePromptfoo(yamlContent: string) { return assert; }); - finalTests.push({ - ...parsedProvider, - prompt: evaTestWithPrompt.prompt, - asserts, - }); + if (evaTestWithPrompt.output !== undefined) { + evaRunTasks.push({ + prompt: evaTestWithPrompt.prompt, + output: evaTestWithPrompt.output, + asserts, + }); + } else { + evaRunTasks.push({ + ...parsedProvider, + prompt: evaTestWithPrompt.prompt, + asserts, + }); + } } } - return finalTests; + return evaRunTasks; } diff --git a/src/types.ts 
b/src/types.ts index 7ab5df4..21689fc 100644 --- a/src/types.ts +++ b/src/types.ts @@ -23,6 +23,30 @@ export type TAssert = { case_sensitive?: boolean; } +export type TEvaTest = { + vars: undefined | Record; + output?: string; + asserts: TAssert[]; +} + +export type TEvaTestWithPrompt = { + prompt: string; + output?: string; + asserts: TAssert[]; +} + +export type TTest = { + prompt: string; + output: string; + asserts: TAssert[]; +} | { + provider: string; + model: string; + options?: TVercelOptions + prompt: string; + asserts: TAssert[]; +} + export type TProviderObj = { id: string; config: Record; diff --git a/tests/index.test.ts b/tests/index.test.ts index bbe57e2..7ff91e0 100644 --- a/tests/index.test.ts +++ b/tests/index.test.ts @@ -417,8 +417,8 @@ tests: value: Hi `; const results = parsePromptfoo(yaml); - expect(results.find(r => r.provider === 'openai')).toBeDefined(); - expect(results.find(r => r.provider === 'anthropic')).toBeDefined(); + expect(results.find(r => 'provider' in r && r.provider === 'openai')).toBeDefined(); + expect(results.find(r => 'provider' in r && r.provider === 'anthropic')).toBeDefined(); }); }); @@ -453,6 +453,162 @@ tests: }); }); + describe('test with output (JQA)', () => { + it('produces result with output and no provider/model when test has output', () => { + const yaml = ` +prompts: ['Say hi'] +providers: ['openai:gpt-4o'] +tests: + - output: "Hello there" + assert: + - type: contains + value: Hello +`; + const results = parsePromptfoo(yaml); + expect(results).toHaveLength(1); + expect(results[0]).toMatchObject({ + prompt: 'Say hi', + output: 'Hello there', + }); + expect(results[0].asserts[0].criteria).toBe('Hello'); + expect('provider' in results[0]).toBe(false); + expect('model' in results[0]).toBe(false); + }); + + it('produces result with provider/model and no output when test has no output', () => { + const yaml = ` +prompts: ['Say hi'] +providers: ['openai:gpt-4o'] +tests: + - assert: + - type: contains + value: Hello 
+`; + const results = parsePromptfoo(yaml); + expect(results).toHaveLength(1); + expect('provider' in results[0] && results[0].provider).toBe('openai'); + expect('model' in results[0] && results[0].model).toBe('gpt-4o'); + expect('output' in results[0]).toBe(false); + }); + + it('handles mix of tests with and without output', () => { + const yaml = ` +prompts: ['Hello'] +providers: ['openai:gpt-4o'] +tests: + - output: "Pre-generated response" + assert: + - type: contains + value: response + - assert: + - type: contains + value: hi +`; + const results = parsePromptfoo(yaml); + expect(results).toHaveLength(2); + + const withOutput = results[0]; + expect('output' in withOutput).toBe(true); + expect('provider' in withOutput).toBe(false); + + const withoutOutput = results[1]; + expect('output' in withoutOutput).toBe(false); + expect('provider' in withoutOutput && withoutOutput.provider).toBe('openai'); + }); + + it('output test still inherits provider in asserts that lack one', () => { + const yaml = ` +prompts: ['Hello'] +providers: ['openai:gpt-4o'] +tests: + - output: "Some response" + assert: + - type: g-eval + value: The response is friendly +`; + const results = parsePromptfoo(yaml); + expect(results[0].asserts[0].provider).toBe('openai'); + expect(results[0].asserts[0].model).toBe('gpt-4o'); + }); + + it('output test with assert-level provider keeps assert provider', () => { + const yaml = ` +prompts: ['Hello'] +providers: ['openai:gpt-4o'] +tests: + - output: "Some response" + assert: + - type: g-eval + value: The response is friendly + provider: anthropic:claude-3-5-sonnet +`; + const results = parsePromptfoo(yaml); + expect(results[0].asserts[0].provider).toBe('anthropic'); + expect(results[0].asserts[0].model).toBe('claude-3-5-sonnet'); + }); + + it('output test with multiple providers produces one result per provider', () => { + const yaml = ` +prompts: ['Hello'] +providers: + - openai:gpt-4o + - anthropic:claude-3-5-sonnet +tests: + - output: 
"Pre-generated" + assert: + - type: contains + value: generated +`; + const results = parsePromptfoo(yaml); + // 2 providers × 1 prompt × 1 test = 2 + expect(results).toHaveLength(2); + // Both should have output, no provider on the result + for (const r of results) { + expect('output' in r && r.output).toBe('Pre-generated'); + expect('provider' in r).toBe(false); + } + }); + + it('output test with vars still injects vars into prompt', () => { + const yaml = ` +prompts: ['Tell me about {{topic}}'] +providers: ['openai:gpt-4o'] +tests: + - vars: + topic: dolphins + output: "Dolphins are amazing marine mammals" + assert: + - type: contains + value: dolphin +`; + const results = parsePromptfoo(yaml); + expect(results[0].prompt).toBe('Tell me about dolphins'); + expect('output' in results[0] && results[0].output).toBe('Dolphins are amazing marine mammals'); + }); + + it('output test cross-product with multiple prompts', () => { + const yaml = ` +prompts: + - 'Prompt A' + - 'Prompt B' +providers: ['openai:gpt-4o'] +tests: + - output: "Fixed response" + assert: + - type: contains + value: response +`; + const results = parsePromptfoo(yaml); + // 1 provider × 2 prompts × 1 test = 2 + expect(results).toHaveLength(2); + expect(results[0].prompt).toBe('Prompt A'); + expect(results[1].prompt).toBe('Prompt B'); + for (const r of results) { + expect('output' in r && r.output).toBe('Fixed response'); + } + }); + }); + describe('provider as object', () => { it('parses top-level provider object with temperature', () => { const yaml = ` @@ -467,9 +623,9 @@ tests: value: Hi `; const results = parsePromptfoo(yaml); - expect(results[0].provider).toBe('openai'); - expect(results[0].model).toBe('gpt-4o'); - expect(results[0].options).toEqual({ temperature: 0.5 }); + expect('provider' in results[0] && results[0].provider).toBe('openai'); + expect('model' in results[0] && results[0].model).toBe('gpt-4o'); + expect('options' in results[0] && results[0].options).toEqual({ temperature: 0.5 
}); }); it('parses top-level provider object without temperature', () => { @@ -484,7 +640,7 @@ tests: value: Hi `; const results = parsePromptfoo(yaml); - expect(results[0].options).toEqual({}); + expect('options' in results[0] && results[0].options).toEqual({}); }); }); });