diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 53a7ddc..3f120df 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -1,12 +1,15 @@ -name: Deploy Docusaurus +name: CI/CD Pipeline on: push: branches: - main + pull_request: + branches: + - main jobs: - build: + test-and-deploy: runs-on: ubuntu-latest permissions: contents: write @@ -22,17 +25,23 @@ jobs: with: node-version: 22 cache: 'pnpm' - cache-dependency-path: docs/pnpm-lock.yaml - - name: Install dependencies + - name: Install Root Dependencies + run: pnpm install --frozen-lockfile + + - name: Run Eva-Judge Tests + run: pnpm run test:coverage + + - name: Install Docs dependencies run: pnpm install --frozen-lockfile working-directory: docs - - name: Build + - name: Build Docs run: pnpm run build working-directory: docs - - name: Deploy + - name: Deploy to GH Pages + if: github.event_name == 'push' && github.ref == 'refs/heads/main' uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/tests/dark_teaming.yaml b/examples/dark_teaming.yaml similarity index 100% rename from tests/dark_teaming.yaml rename to examples/dark_teaming.yaml diff --git a/tests/promptfooconfig.yaml b/examples/promptfooconfig.yaml similarity index 100% rename from tests/promptfooconfig.yaml rename to examples/promptfooconfig.yaml diff --git a/tests/temperature_sensitivy.yaml b/examples/temperature_sensitivy.yaml similarity index 100% rename from tests/temperature_sensitivy.yaml rename to examples/temperature_sensitivy.yaml diff --git a/tests/temperature_sensitivy_rus.yaml b/examples/temperature_sensitivy_rus.yaml similarity index 100% rename from tests/temperature_sensitivy_rus.yaml rename to examples/temperature_sensitivy_rus.yaml diff --git a/jest.config.js b/jest.config.js new file mode 100644 index 0000000..4463721 --- /dev/null +++ b/jest.config.js @@ -0,0 +1,41 @@ +const { createDefaultPreset } = require('ts-jest'); + +const 
tsJestTransformCfg = createDefaultPreset().transform; + +/** @type {import("jest").Config} **/ +module.exports = { + testEnvironment: 'node', + transform: { + '^.+\\.tsx?$': ['ts-jest', { + tsconfig: { + allowSyntheticDefaultImports: true, + }, + }], + }, + moduleNameMapper: { + '^utils$': '<rootDir>/src/utils', + '^schemas$': '<rootDir>/src/schemas', + }, + testPathIgnorePatterns: [ + "/node_modules/", + "<rootDir>/src/", + "<rootDir>/dst/", + ], + coverageDirectory: "coverage", + coverageProvider: "v8", + collectCoverageFrom: [ + "src/**/*.{ts,js}", + "!src/**/*.d.ts", + "!src/types/**", + "!**/node_modules/**" + ], + coverageReporters: ["text", "lcov", "clover"], + coverageThreshold: { + global: { + branches: 95, + functions: 90, + lines: 85, + statements: 85, + }, + }, +}; diff --git a/package.json b/package.json index e84e9c5..e9ffc5f 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ "bin": "node ./dst/index.js", "build": "tsc", "prepare": "pnpm run build", - "test": "echo \"Error: no test specified\" && exit 1" + "test": "jest", + "test:coverage": "jest --coverage" }, "repository": { "type": "git", diff --git a/tests/db.test.ts b/tests/db.test.ts new file mode 100644 index 0000000..7796643 --- /dev/null +++ b/tests/db.test.ts @@ -0,0 +1,162 @@ +import type { ITestResult, IAssertResult } from '../src/types'; + +const mockSqlTagged = jest.fn(); +const mockSqlIdentifier = jest.fn((...args: any[]) => args[0]); + +const mockSql = Object.assign(mockSqlTagged, { + call: mockSqlTagged, +}); + +// Make `sql('TableName')` work for identifiers and `sql(array)` for IN lists +// while `sql`...`` (tagged template) calls mockSqlTagged +const sqlProxy = new Proxy(mockSql, { + apply(_target, _thisArg, argArray) { + // Tagged template: first arg is TemplateStringsArray + if (Array.isArray(argArray[0]) && 'raw' in argArray[0]) { + return mockSqlTagged(...argArray); + } + // Regular call: sql('TableName') or sql(array) + return mockSqlIdentifier(...argArray); + }, +}); + +jest.mock('postgres', () => 
({ + __esModule: true, + default: jest.fn(() => sqlProxy), +})); + +// Must import after mock setup +import { getFinishedTests, getFinishedAsserts } from '../src/db'; + +const FAKE_TEST_RESULTS: ITestResult[] = [ + { + id: 'test-1', + run_id: 'run-1', + provider: 'openai', + model: 'gpt-4', + prompt: 'hello', + output: 'world', + passed: true, + metadata: null, + started_at: new Date('2025-01-01'), + assert_started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 100, + assert_diff_ms: 50, + output_diff_ms: 50, + }, +]; + +const FAKE_ASSERT_RESULTS: IAssertResult[] = [ + { + id: 'assert-1', + test_id: 'test-1', + run_id: 'run-1', + name: 'accuracy', + criteria: 'must be accurate', + passed: true, + score: 0.9, + reason: 'looks good', + threshold: 0.5, + metadata: null, + started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 30, + }, +]; + +describe('db', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + describe('getFinishedTests', () => { + it('should query TestResult table with run_id and test ids', async () => { + mockSqlTagged.mockResolvedValueOnce(FAKE_TEST_RESULTS); + + const runId = 'run-1'; + const testIds = ['test-1', 'test-2']; + const result = await getFinishedTests(runId, testIds); + + expect(result).toEqual(FAKE_TEST_RESULTS); + expect(mockSqlTagged).toHaveBeenCalledTimes(1); + + // Verify template strings contain the expected SQL fragments + const templateStrings = mockSqlTagged.mock.calls[0][0]; + const joined = templateStrings.join('??'); + expect(joined).toContain('SELECT'); + expect(joined).toContain('FROM'); + expect(joined).toContain('WHERE'); + expect(joined).toContain('run_id'); + expect(joined).toContain('IN'); + + // Verify interpolated values include the runId (table identifier and testIds are asserted via mockSqlIdentifier in the tests below) + const interpolatedValues = mockSqlTagged.mock.calls[0].slice(1); + expect(interpolatedValues).toContain(runId); + }); + + it('should pass the table name through sql identifier 
helper', async () => { + mockSqlTagged.mockResolvedValueOnce([]); + + await getFinishedTests('run-1', ['test-1']); + + expect(mockSqlIdentifier).toHaveBeenCalledWith('TestResult'); + }); + + it('should pass testIds through sql helper for IN clause', async () => { + mockSqlTagged.mockResolvedValueOnce([]); + const testIds = ['t-1', 't-2', 't-3']; + + await getFinishedTests('run-1', testIds); + + expect(mockSqlIdentifier).toHaveBeenCalledWith(testIds); + }); + }); + + describe('getFinishedAsserts', () => { + it('should query AssertResult table with run_id and test ids', async () => { + mockSqlTagged.mockResolvedValueOnce(FAKE_ASSERT_RESULTS); + + const runId = 'run-1'; + const testIds = ['test-1']; + const result = await getFinishedAsserts(runId, testIds); + + expect(result).toEqual(FAKE_ASSERT_RESULTS); + expect(mockSqlTagged).toHaveBeenCalledTimes(1); + + const templateStrings = mockSqlTagged.mock.calls[0][0]; + const joined = templateStrings.join('??'); + expect(joined).toContain('SELECT'); + expect(joined).toContain('FROM'); + expect(joined).toContain('WHERE'); + expect(joined).toContain('run_id'); + expect(joined).toContain('test_id'); + expect(joined).toContain('IN'); + }); + + it('should pass the table name through sql identifier helper', async () => { + mockSqlTagged.mockResolvedValueOnce([]); + + await getFinishedAsserts('run-1', ['test-1']); + + expect(mockSqlIdentifier).toHaveBeenCalledWith('AssertResult'); + }); + + it('should pass testIds through sql helper for IN clause', async () => { + mockSqlTagged.mockResolvedValueOnce([]); + const testIds = ['t-1', 't-2']; + + await getFinishedAsserts('run-1', testIds); + + expect(mockSqlIdentifier).toHaveBeenCalledWith(testIds); + }); + + it('should return empty array when no results', async () => { + mockSqlTagged.mockResolvedValueOnce([]); + + const result = await getFinishedAsserts('run-x', ['t-none']); + + expect(result).toEqual([]); + }); + }); +}); diff --git a/tests/index.test.ts b/tests/index.test.ts new 
file mode 100644 index 0000000..d69a5ba --- /dev/null +++ b/tests/index.test.ts @@ -0,0 +1,486 @@ +import type { ITestResult, TReport, IEpistemicReport, IAssertResult } from '../src/types'; + +let capturedAction: (suite?: string) => Promise<void>; + +const mockCommand: Record<string, jest.Mock> = { + name: jest.fn().mockReturnThis(), + version: jest.fn().mockReturnThis(), + description: jest.fn().mockReturnThis(), + command: jest.fn().mockReturnThis(), + argument: jest.fn().mockReturnThis(), + action: jest.fn((fn: any) => { + capturedAction = fn; + return mockCommand; + }), + parse: jest.fn(), +}; + +jest.mock('commander', () => ({ + Command: jest.fn(() => mockCommand), +})); + +const mockIntro = jest.fn(); +const mockText = jest.fn(); +const mockIsCancel = jest.fn(() => false); +const mockSpinner = jest.fn(() => ({ start: jest.fn(), stop: jest.fn() })); +const mockCancel = jest.fn(); +const mockOutro = jest.fn(); + +jest.mock('@clack/prompts', () => ({ + intro: (...args: any[]) => mockIntro(...args), + text: (...args: any[]) => mockText(...args), + isCancel: (value: unknown) => mockIsCancel(value), + spinner: () => mockSpinner(), + cancel: (...args: any[]) => mockCancel(...args), + outro: (...args: any[]) => mockOutro(...args), +})); + +jest.mock('picocolors', () => ({ + __esModule: true, + default: { + bgCyan: (s: string) => s, + black: (s: string) => s, + yellow: (s: string) => s, + red: (s: string) => s, + green: (s: string) => s, + blue: (s: string) => s, + cyan: (s: string) => s, + magenta: (s: string) => s, + bold: (s: string) => s, + }, +})); + +const mockParsePromptfoo = jest.fn(() => [{ task: 'parsed' }]); +jest.mock('@eva-llm/eva-parser', () => ({ + parsePromptfoo: (content: string) => mockParsePromptfoo(content), +})); + +const mockReadFileSync = jest.fn(() => 'file content'); +jest.mock('node:fs', () => ({ + readFileSync: (path: string, encoding: string) => mockReadFileSync(path, encoding), +})); + +const mockUuidv7 = jest.fn(() => 'mock-uuid-v7'); +jest.mock('uuidv7', () => 
({ + uuidv7: () => mockUuidv7(), +})); + +const mockRequest = jest.fn(); +jest.mock('undici', () => ({ + request: (...args: any[]) => mockRequest(...args), +})); + +const mockObserve = jest.fn(); +jest.mock('../src/utils', () => ({ + observe: (...args: any[]) => mockObserve(...args), +})); + +// Load the module — registers the commander action via the mock +require('../src/index'); + +const makeReport = (overrides: Partial<TReport> = {}): TReport => ({ + testsAmount: 2, + passedTestsAmount: 2, + failedTests: [], + epistemicTests: [], + missedTestsAmount: 0, + ...overrides, +}); + +const makeTestResult = (overrides: Partial<ITestResult> = {}): ITestResult => ({ + id: 'test-1', + run_id: 'run-1', + provider: 'openai', + model: 'gpt-4', + prompt: 'test prompt', + output: 'test output', + passed: false, + metadata: null, + started_at: new Date('2025-01-01'), + assert_started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 100, + assert_diff_ms: 50, + output_diff_ms: 50, + ...overrides, +}); + +const makeAssertResult = (overrides: Partial<IAssertResult> = {}): IAssertResult => ({ + id: 'assert-1', + test_id: 'test-1', + run_id: 'run-1', + name: 'test-assert', + criteria: 'test criteria', + passed: false, + score: 0, + reason: 'it failed', + threshold: 0.5, + metadata: null, + started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 50, + ...overrides, +}); + +describe('eva-cli index module', () => { + let consoleSpy: jest.SpiedFunction<typeof console.log>; + let exitSpy: jest.SpiedFunction<typeof process.exit>; + + beforeEach(() => { + jest.clearAllMocks(); + consoleSpy = jest.spyOn(console, 'log').mockImplementation(); + exitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => { + throw new Error(`process.exit(${code})`); + }) as any); + }); + + afterEach(() => { + consoleSpy.mockRestore(); + exitSpy.mockRestore(); + }); + + describe('run command', () => { + const setupMocks = (report: TReport = makeReport()) => { + mockRequest.mockResolvedValue({ + statusCode: 200, + 
body: { + json: () => Promise.resolve({ test_ids: ['tid-1', 'tid-2'] }), + text: () => Promise.resolve(''), + }, + }); + mockObserve.mockResolvedValue(report); + }; + + it('should read and parse the suite file', async () => { + setupMocks(); + + await expect(capturedAction('my-suite.yaml')).rejects.toThrow('process.exit'); + + expect(mockReadFileSync).toHaveBeenCalledWith('my-suite.yaml', 'utf-8'); + expect(mockParsePromptfoo).toHaveBeenCalledWith('file content'); + }); + + it('should submit parsed tasks to the eva-run cluster', async () => { + setupMocks(); + + await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit'); + + expect(mockRequest).toHaveBeenCalledWith( + 'http://localhost:3000/eval', + expect.objectContaining({ + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify([{ run_id: 'mock-uuid-v7', task: 'parsed' }]), + bodyTimeout: 0, + headersTimeout: 0, + }), + ); + }); + + it('should observe test results with run_id and test_ids', async () => { + setupMocks(); + + await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit'); + + expect(mockObserve).toHaveBeenCalledWith('mock-uuid-v7', ['tid-1', 'tid-2']); + }); + + it('should prompt for suite path when not provided as argument', async () => { + setupMocks(); + mockText.mockResolvedValue('interactive-suite.yaml'); + + await expect(capturedAction(undefined)).rejects.toThrow('process.exit'); + + expect(mockText).toHaveBeenCalledWith( + expect.objectContaining({ message: 'Provide path to the test suite:' }), + ); + expect(mockReadFileSync).toHaveBeenCalledWith('interactive-suite.yaml', 'utf-8'); + }); + + it('should validate that prompt input is not empty', async () => { + setupMocks(); + mockText.mockResolvedValue('suite.yaml'); + + await expect(capturedAction(undefined)).rejects.toThrow('process.exit'); + + const textConfig = mockText.mock.calls[0][0]; + expect(textConfig.validate('')).toBe('Please enter a path'); + 
expect(textConfig.validate('some-path')).toBeUndefined(); + }); + + it('should exit gracefully when user cancels the prompt', async () => { + mockIsCancel.mockReturnValueOnce(true); + mockText.mockResolvedValue(Symbol('cancel')); + + await expect(capturedAction(undefined)).rejects.toThrow('process.exit(0)'); + + expect(mockCancel).toHaveBeenCalledWith('Operation cancelled.'); + expect(mockRequest).not.toHaveBeenCalled(); + }); + + it('should throw on non-200 API response', async () => { + mockRequest.mockResolvedValue({ + statusCode: 500, + body: { + text: () => Promise.resolve('Internal Server Error'), + }, + }); + + await expect(capturedAction('suite.yaml')).rejects.toThrow( + 'Server responded with 500: Internal Server Error', + ); + }); + + it('should call outro and exit after printing report', async () => { + setupMocks(); + + await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit'); + + expect(mockOutro).toHaveBeenCalled(); + expect(exitSpy).toHaveBeenCalledWith(0); + }); + + it('should log submission and test count messages', async () => { + setupMocks(); + + await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit'); + + const logs = consoleSpy.mock.calls.map((c) => c[0]); + expect(logs).toContain('Submitting to eva-run cluster (localhost:3000)...'); + expect(logs).toContain('2 test(s) are started...'); + }); + }); + + describe('printReport', () => { + const setupAndRun = async (report: TReport) => { + mockRequest.mockResolvedValue({ + statusCode: 200, + body: { json: () => Promise.resolve({ test_ids: ['t1'] }) }, + }); + mockObserve.mockResolvedValue(report); + + await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit'); + }; + + it('should print summary for all-passing report', async () => { + await setupAndRun( + makeReport({ + testsAmount: 3, + passedTestsAmount: 3, + failedTests: [], + epistemicTests: [], + missedTestsAmount: 0, + }), + ); + + const logs = consoleSpy.mock.calls.map((c) => 
c[0]).join('\n'); + expect(logs).toContain('Passed tests: 3'); + expect(logs).toContain('Failed tests: 0'); + expect(logs).toContain('Total tests: 3'); + expect(logs).toContain('Missed tests: 0'); + expect(logs).not.toContain('Failed test details:'); + }); + + it('should print failed test details with asserts', async () => { + const failedTest = makeTestResult({ + id: 'ft-1', + provider: 'anthropic', + model: 'claude-3', + prompt: 'why?', + output: 'because', + passed: false, + asserts: [ + makeAssertResult({ + criteria: 'must be polite', + reason: 'was rude', + passed: false, + score: 0.2, + threshold: 0.8, + }), + ], + }); + + await setupAndRun( + makeReport({ + testsAmount: 1, + passedTestsAmount: 0, + failedTests: [failedTest], + }), + ); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('Failed test details:'); + expect(logs).toContain('why?'); + expect(logs).toContain('because'); + expect(logs).toContain('must be polite'); + expect(logs).toContain('was rude'); + expect(logs).toContain('Failed tests: 1'); + }); + + it('should print epistemic test details', async () => { + const epistemicTest: IEpistemicReport = { + ...makeTestResult({ id: 'et-1', prompt: 'epistemic q', output: 'answer' }), + honesty: 0.875, + deviation: 0.125, + }; + + await setupAndRun( + makeReport({ + testsAmount: 1, + passedTestsAmount: 1, + failedTests: [], + epistemicTests: [epistemicTest], + }), + ); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('Epistemic test details:'); + expect(logs).toContain('epistemic q'); + expect(logs).toContain('answer'); + expect(logs).toContain('0.875'); + expect(logs).toContain('0.125'); + expect(logs).toContain('Epistemic tests: 1'); + }); + + it('should print missed tests count', async () => { + await setupAndRun( + makeReport({ + testsAmount: 5, + passedTestsAmount: 3, + missedTestsAmount: 2, + }), + ); + + const logs = 
consoleSpy.mock.calls.map((c) => c[0]).join('\n'); + expect(logs).toContain('Missed tests: 2'); + }); + + it('should display temperature in model info for failed tests', async () => { + const test = makeTestResult({ + provider: 'openai', + model: 'gpt-4', + metadata: { temperature: 0.7 }, + asserts: [makeAssertResult()], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('openai | gpt-4 | T=0.7'); + }); + + it('should display topP in model info for failed tests', async () => { + const test = makeTestResult({ + provider: 'openai', + model: 'gpt-4', + metadata: { topP: 0.9 }, + asserts: [makeAssertResult()], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('openai | gpt-4 | topP=0.9'); + }); + + it('should display topK in model info for failed tests', async () => { + const test = makeTestResult({ + provider: 'openai', + model: 'gpt-4', + metadata: { topK: 50 }, + asserts: [makeAssertResult()], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('openai | gpt-4 | topK=50'); + }); + + it('should display provider and model without extras when no relevant metadata', async () => { + const test = makeTestResult({ + provider: 'anthropic', + model: 'claude-3', + metadata: null, + asserts: [makeAssertResult()], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('anthropic | claude-3'); + expect(logs).not.toMatch(/T=|topP=|topK=/); + }); + + 
it('should print must_fail metadata in assert summary', async () => { + const test = makeTestResult({ + asserts: [ + makeAssertResult({ + passed: true, + score: 0.9, + threshold: 0.5, + metadata: { must_fail: true }, + }), + ], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('must_fail: true'); + }); + + it('should omit must_fail when assert metadata is null', async () => { + const test = makeTestResult({ + asserts: [makeAssertResult({ metadata: null })], + }); + + await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).not.toContain('must_fail'); + }); + + it('should display model info for epistemic tests', async () => { + const epistemicTest: IEpistemicReport = { + ...makeTestResult({ + provider: 'google', + model: 'gemini-pro', + metadata: { temperature: 0.3 }, + }), + honesty: 1, + deviation: 0, + }; + + await setupAndRun(makeReport({ epistemicTests: [epistemicTest] })); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('google | gemini-pro | T=0.3'); + }); + + it('should handle multiple failed tests', async () => { + const tests = [ + makeTestResult({ + id: 'ft-1', + prompt: 'prompt A', + asserts: [makeAssertResult({ id: 'a1', criteria: 'criterion A' })], + }), + makeTestResult({ + id: 'ft-2', + prompt: 'prompt B', + asserts: [makeAssertResult({ id: 'a2', criteria: 'criterion B' })], + }), + ]; + + await setupAndRun( + makeReport({ testsAmount: 2, passedTestsAmount: 0, failedTests: tests }), + ); + + const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n'); + expect(logs).toContain('prompt A'); + expect(logs).toContain('prompt B'); + expect(logs).toContain('criterion A'); + 
expect(logs).toContain('criterion B'); + }); + }); +}); diff --git a/tests/utils.test.ts b/tests/utils.test.ts new file mode 100644 index 0000000..18d6ce0 --- /dev/null +++ b/tests/utils.test.ts @@ -0,0 +1,355 @@ +import { xnor, observe } from '../src/utils'; +import { getFinishedTests, getFinishedAsserts } from '../src/db'; +import type { ITestResult, IAssertResult } from '../src/types'; + +jest.mock('picocolors', () => ({ + __esModule: true, + default: { yellow: jest.fn((s: string) => s) }, +})); +jest.mock('../src/db', () => ({ + getFinishedTests: jest.fn(), + getFinishedAsserts: jest.fn(), +})); + +const mockedGetFinishedTests = getFinishedTests as jest.Mock; +const mockedGetFinishedAsserts = getFinishedAsserts as jest.Mock; + +const makeTestResult = (overrides: Partial<ITestResult> = {}): ITestResult => ({ + id: 'test-1', + run_id: 'run-1', + provider: 'openai', + model: 'gpt-4', + prompt: 'test prompt', + output: 'test output', + passed: true, + metadata: null, + started_at: new Date('2025-01-01'), + assert_started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 100, + assert_diff_ms: 50, + output_diff_ms: 50, + ...overrides, +}); + +const makeAssertResult = (overrides: Partial<IAssertResult> = {}): IAssertResult => ({ + id: 'assert-1', + test_id: 'test-1', + run_id: 'run-1', + name: 'test-assert', + criteria: 'test criteria', + passed: true, + score: 1, + reason: 'passed', + threshold: 0.5, + metadata: null, + started_at: new Date('2025-01-01'), + finished_at: new Date('2025-01-01'), + diff_ms: 50, + ...overrides, +}); + +describe('utils module', () => { + describe('xnor', () => { + it('should return true when both are true', () => { + expect(xnor(true, true)).toBe(true); + }); + + it('should return true when both are false', () => { + expect(xnor(false, false)).toBe(true); + }); + + it('should return false when first is true and second is false', () => { + expect(xnor(true, false)).toBe(false); + }); + + it('should return false when first is false and 
second is true', () => { + expect(xnor(false, true)).toBe(false); + }); + }); + + describe('observe', () => { + let consoleSpy: jest.SpiedFunction<typeof console.log>; + + beforeEach(() => { + jest.useFakeTimers(); + jest.clearAllMocks(); + consoleSpy = jest.spyOn(console, 'log').mockImplementation(); + }); + + afterEach(() => { + jest.useRealTimers(); + consoleSpy.mockRestore(); + }); + + it('should return report when all tests complete in one batch', async () => { + const tests = [ + makeTestResult({ id: 'test-1', passed: true }), + makeTestResult({ id: 'test-2', passed: true }), + ]; + + mockedGetFinishedTests.mockResolvedValueOnce(tests); + mockedGetFinishedAsserts.mockResolvedValueOnce([]); + + const promise = observe('run-1', ['test-1', 'test-2']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result).toEqual({ + testsAmount: 2, + passedTestsAmount: 2, + failedTests: [], + epistemicTests: [], + missedTestsAmount: 0, + }); + }); + + it('should handle tests completing in multiple batches', async () => { + mockedGetFinishedTests + .mockResolvedValueOnce([makeTestResult({ id: 'test-1' })]) + .mockResolvedValueOnce([makeTestResult({ id: 'test-2' })]); + mockedGetFinishedAsserts.mockResolvedValueOnce([]); + + const promise = observe('run-1', ['test-1', 'test-2']); + await jest.advanceTimersByTimeAsync(200); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.testsAmount).toBe(2); + expect(result.passedTestsAmount).toBe(2); + expect(result.failedTests).toEqual([]); + expect(result.missedTestsAmount).toBe(0); + expect(consoleSpy).toHaveBeenCalledTimes(2); + }); + + it('should report failed tests with their asserts attached', async () => { + const failedTest = makeTestResult({ id: 'test-1', passed: false }); + const passingTest = makeTestResult({ id: 'test-2', passed: true }); + const failedAssert = makeAssertResult({ + id: 'assert-1', + test_id: 'test-1', + passed: false, + score: 0, + reason: 'failed 
criteria', + }); + + mockedGetFinishedTests.mockResolvedValueOnce([failedTest, passingTest]); + mockedGetFinishedAsserts.mockResolvedValueOnce([failedAssert]); + + const promise = observe('run-1', ['test-1', 'test-2']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.testsAmount).toBe(2); + expect(result.passedTestsAmount).toBe(1); + expect(result.failedTests).toHaveLength(1); + expect(result.failedTests[0].id).toBe('test-1'); + expect(result.failedTests[0].asserts).toEqual([failedAssert]); + }); + + it('should calculate epistemic test metrics', async () => { + const test = makeTestResult({ id: 'test-1', passed: true }); + + const positivePassedAssert = makeAssertResult({ + id: 'assert-pos-pass', + test_id: 'test-1', + passed: true, + metadata: null, + }); + const positiveFailedAssert = makeAssertResult({ + id: 'assert-pos-fail', + test_id: 'test-1', + passed: false, + metadata: null, + }); + const negativeFailedAssert = makeAssertResult({ + id: 'assert-neg-fail', + test_id: 'test-1', + passed: false, + metadata: { must_fail: true }, + }); + const negativePassedAssert = makeAssertResult({ + id: 'assert-neg-pass', + test_id: 'test-1', + passed: true, + metadata: { must_fail: true }, + }); + + mockedGetFinishedTests.mockResolvedValueOnce([test]); + mockedGetFinishedAsserts.mockResolvedValueOnce([ + positivePassedAssert, + positiveFailedAssert, + negativeFailedAssert, + negativePassedAssert, + ]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.epistemicTests).toHaveLength(1); + // positiveAsserts: 2, positivePassedAsserts: 1 → ratio 0.5 + // negativeAsserts: 2, negativeFailedAsserts: 1 → ratio 0.5 + // honesty = |1 - 0.5 - 0.5| = 0, deviation = 1 + expect(result.epistemicTests[0].honesty).toBe(0); + expect(result.epistemicTests[0].deviation).toBe(1); + }); + + it('should calculate perfect epistemic scores', async () => { + 
const test = makeTestResult({ id: 'test-1', passed: true }); + + const positiveAssert = makeAssertResult({ + id: 'assert-pos', + test_id: 'test-1', + passed: true, + metadata: null, + }); + const negativeAssert = makeAssertResult({ + id: 'assert-neg', + test_id: 'test-1', + passed: false, + metadata: { must_fail: true }, + }); + + mockedGetFinishedTests.mockResolvedValueOnce([test]); + mockedGetFinishedAsserts.mockResolvedValueOnce([positiveAssert, negativeAssert]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + // positivePassedAsserts/positiveAsserts = 1/1, negativeFailedAsserts/negativeAsserts = 1/1 + // honesty = |1 - 1 - 1| = 1, deviation = 0 + expect(result.epistemicTests[0].honesty).toBe(1); + expect(result.epistemicTests[0].deviation).toBe(0); + }); + + it('should skip epistemic entries when test is not found in results', async () => { + const test = makeTestResult({ id: 'test-1', passed: true }); + + const assertForMissingTest = makeAssertResult({ + id: 'assert-1', + test_id: 'test-nonexistent', + metadata: { must_fail: true }, + }); + + mockedGetFinishedTests.mockResolvedValueOnce([test]); + mockedGetFinishedAsserts.mockResolvedValueOnce([assertForMissingTest]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.epistemicTests).toEqual([]); + }); + + it('should correctly filter failed asserts using xnor logic', async () => { + const test = makeTestResult({ id: 'test-1', passed: false }); + + // Regular assert that failed → included (unexpected outcome) + const regularFailed = makeAssertResult({ + id: 'a1', + test_id: 'test-1', + passed: false, + metadata: null, + }); + // Regular assert that passed → excluded (expected outcome) + const regularPassed = makeAssertResult({ + id: 'a2', + test_id: 'test-1', + passed: true, + metadata: null, + }); + // must_fail assert that passed → 
included (unexpected outcome) + const mustFailPassed = makeAssertResult({ + id: 'a3', + test_id: 'test-1', + passed: true, + metadata: { must_fail: true }, + }); + // must_fail assert that failed → excluded (expected outcome) + const mustFailFailed = makeAssertResult({ + id: 'a4', + test_id: 'test-1', + passed: false, + metadata: { must_fail: true }, + }); + + mockedGetFinishedTests.mockResolvedValueOnce([test]); + mockedGetFinishedAsserts.mockResolvedValueOnce([ + regularFailed, regularPassed, mustFailPassed, mustFailFailed, + ]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.failedTests).toHaveLength(1); + const attachedAsserts = result.failedTests[0].asserts!; + expect(attachedAsserts).toHaveLength(2); + expect(attachedAsserts.map(a => a.id).sort()).toEqual(['a1', 'a3']); + }); + + it('should handle idle iterations before tests complete', async () => { + mockedGetFinishedTests + .mockResolvedValueOnce([]) + .mockResolvedValueOnce([]) + .mockResolvedValueOnce([makeTestResult({ id: 'test-1' })]); + mockedGetFinishedAsserts.mockResolvedValueOnce([]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + await jest.advanceTimersByTimeAsync(200); + await jest.advanceTimersByTimeAsync(200); + const result = await promise; + + expect(result.testsAmount).toBe(1); + expect(result.passedTestsAmount).toBe(1); + expect(result.missedTestsAmount).toBe(0); + }); + + it('should log progress for each completed batch', async () => { + mockedGetFinishedTests + .mockResolvedValueOnce([makeTestResult({ id: 'test-1' })]) + .mockResolvedValueOnce([makeTestResult({ id: 'test-2' })]); + mockedGetFinishedAsserts.mockResolvedValueOnce([]); + + const promise = observe('run-1', ['test-1', 'test-2']); + await jest.advanceTimersByTimeAsync(200); + await jest.advanceTimersByTimeAsync(200); + await promise; + + expect(consoleSpy).toHaveBeenCalledTimes(2); + 
}); + + it('should pass runId and testIds to db functions', async () => { + mockedGetFinishedTests.mockResolvedValueOnce([ + makeTestResult({ id: 'test-1' }), + ]); + mockedGetFinishedAsserts.mockResolvedValueOnce([]); + + const promise = observe('run-123', ['test-1']); + await jest.advanceTimersByTimeAsync(200); + await promise; + + expect(mockedGetFinishedTests).toHaveBeenCalledWith('run-123', ['test-1']); + expect(mockedGetFinishedAsserts).toHaveBeenCalledWith('run-123', ['test-1']); + }); + + // NOTE: This test must run last because it leaves the module-level + // idleCounter at MAX_IDLE_ITERATIONS, which persists across tests. + it('should break on idle timeout and report missed tests', async () => { + mockedGetFinishedTests.mockResolvedValue([]); + mockedGetFinishedAsserts.mockResolvedValue([]); + + const promise = observe('run-1', ['test-1']); + await jest.advanceTimersByTimeAsync(60_000); + const result = await promise; + + expect(result.testsAmount).toBe(1); + expect(result.passedTestsAmount).toBe(1); + expect(result.failedTests).toEqual([]); + expect(result.missedTestsAmount).toBe(1); + }); + }); +});