diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 53a7ddc..3f120df 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -1,12 +1,15 @@
-name: Deploy Docusaurus
+name: CI/CD Pipeline
 
 on:
   push:
     branches:
       - main
+  pull_request:
+    branches:
+      - main
 
 jobs:
-  build:
+  test-and-deploy:
     runs-on: ubuntu-latest
     permissions:
       contents: write
@@ -22,17 +25,28 @@ jobs:
         with:
           node-version: 22
           cache: 'pnpm'
-          cache-dependency-path: docs/pnpm-lock.yaml
+          # Root and docs installs each use their own lockfile; hash both into the cache key.
+          cache-dependency-path: |
+            pnpm-lock.yaml
+            docs/pnpm-lock.yaml
 
-      - name: Install dependencies
+      - name: Install Root Dependencies
+        run: pnpm install --frozen-lockfile
+
+      - name: Run Eva-Judge Tests
+        run: pnpm run test:coverage
+
+      - name: Install Docs dependencies
         run: pnpm install --frozen-lockfile
         working-directory: docs
 
-      - name: Build
+      - name: Build Docs
         run: pnpm run build
         working-directory: docs
 
-      - name: Deploy
+      # Pull requests stop after the docs build; only pushes to main publish the site.
+      - name: Deploy to GH Pages
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
         uses: peaceiris/actions-gh-pages@v3
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/tests/dark_teaming.yaml b/examples/dark_teaming.yaml
similarity index 100%
rename from tests/dark_teaming.yaml
rename to examples/dark_teaming.yaml
diff --git a/tests/promptfooconfig.yaml b/examples/promptfooconfig.yaml
similarity index 100%
rename from tests/promptfooconfig.yaml
rename to examples/promptfooconfig.yaml
diff --git a/tests/temperature_sensitivy.yaml b/examples/temperature_sensitivy.yaml
similarity index 100%
rename from tests/temperature_sensitivy.yaml
rename to examples/temperature_sensitivy.yaml
diff --git a/tests/temperature_sensitivy_rus.yaml b/examples/temperature_sensitivy_rus.yaml
similarity index 100%
rename from tests/temperature_sensitivy_rus.yaml
rename to examples/temperature_sensitivy_rus.yaml
diff --git a/jest.config.js b/jest.config.js
new file mode 100644
index 0000000..4463721
--- /dev/null
+++ b/jest.config.js
@@ -0,0 +1,41 @@
+const { createDefaultPreset } = require('ts-jest');
+
+const tsJestTransformCfg = createDefaultPreset().transform;
+
+/** @type {import("jest").Config} **/
+module.exports = {
+  testEnvironment: 'node',
+  transform: {
+    '^.+\\.tsx?$': ['ts-jest', {
+      tsconfig: {
+        allowSyntheticDefaultImports: true,
+      },
+    }],
+  },
+  moduleNameMapper: {
+    '^utils$': '<rootDir>/src/utils',
+    '^schemas$': '<rootDir>/src/schemas',
+  },
+  testPathIgnorePatterns: [
+    "/node_modules/",
+    "/src/",
+    "/dst/",
+  ],
+  coverageDirectory: "coverage",
+  coverageProvider: "v8",
+  collectCoverageFrom: [
+    "src/**/*.{ts,js}",
+    "!src/**/*.d.ts",
+    "!src/types/**",
+    "!**/node_modules/**"
+  ],
+  coverageReporters: ["text", "lcov", "clover"],
+  coverageThreshold: {
+    global: {
+      branches: 95,
+      functions: 90,
+      lines: 85,
+      statements: 85,
+    },
+  },
+};
diff --git a/package.json b/package.json
index e84e9c5..e9ffc5f 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,8 @@
     "bin": "node ./dst/index.js",
     "build": "tsc",
     "prepare": "pnpm run build",
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "jest",
+    "test:coverage": "jest --coverage"
   },
   "repository": {
     "type": "git",
diff --git a/tests/db.test.ts b/tests/db.test.ts
new file mode 100644
index 0000000..7796643
--- /dev/null
+++ b/tests/db.test.ts
@@ -0,0 +1,162 @@
+import type { ITestResult, IAssertResult } from '../src/types';
+
+// Records tagged-template calls, i.e. queries written as sql`SELECT ...`.
+const mockSqlTagged = jest.fn();
+// Records helper calls (sql('TableName') / sql(array)) and echoes the first argument.
+const mockSqlIdentifier = jest.fn((...args: unknown[]) => args[0]);
+
+// Stand-in for the client returned by postgres(): routes each call to one of the
+// two mocks above depending on how it was invoked. The proxy target is the plain
+// jest.fn, so Function.prototype members such as .call keep their usual meaning.
+// The name starts with "mock" because jest.mock() factories are hoisted and the
+// Jest docs only sanction out-of-scope references that carry that prefix.
+const mockSqlProxy = new Proxy(mockSqlTagged, {
+  apply(_target, _thisArg, argArray) {
+    const [first] = argArray;
+    // Tagged template: the first argument is a TemplateStringsArray (an array with `raw`).
+    const isTaggedTemplate = Array.isArray(first) && 'raw' in first;
+    // Anything else is a regular call: sql('TableName') or sql(array).
+    return isTaggedTemplate ? mockSqlTagged(...argArray) : mockSqlIdentifier(...argArray);
+  },
+});
+
+jest.mock('postgres', () => ({
+  __esModule: true,
+  default: jest.fn(() => mockSqlProxy),
+}));
+
+// Must import after mock setup
+import { getFinishedTests, getFinishedAsserts } from '../src/db';
+
+const FAKE_TEST_RESULTS: ITestResult[] = [ // rows the mocked tagged-template query resolves with in the getFinishedTests tests
+  {
+    id: 'test-1',
+    run_id: 'run-1',
+    provider: 'openai',
+    model: 'gpt-4',
+    prompt: 'hello',
+    output: 'world',
+    passed: true,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    assert_started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 100,
+    assert_diff_ms: 50,
+    output_diff_ms: 50,
+  },
+];
+
+const FAKE_ASSERT_RESULTS: IAssertResult[] = [ // rows the mocked tagged-template query resolves with in the getFinishedAsserts tests
+  {
+    id: 'assert-1',
+    test_id: 'test-1',
+    run_id: 'run-1',
+    name: 'accuracy',
+    criteria: 'must be accurate',
+    passed: true,
+    score: 0.9,
+    reason: 'looks good',
+    threshold: 0.5,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 30,
+  },
+];
+
+describe('db', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+  });
+
+  describe('getFinishedTests', () => {
+    it('should query TestResult table with run_id and test ids', async () => {
+      mockSqlTagged.mockResolvedValueOnce(FAKE_TEST_RESULTS);
+
+      const runId = 'run-1';
+      const testIds = ['test-1', 'test-2'];
+      const result = await getFinishedTests(runId, testIds);
+
+      expect(result).toEqual(FAKE_TEST_RESULTS);
+      expect(mockSqlTagged).toHaveBeenCalledTimes(1);
+
+      // The first call argument is the TemplateStringsArray; join its literal chunks ('??' marks each interpolation slot) and check the SQL keywords
+      const templateStrings = mockSqlTagged.mock.calls[0][0];
+      const joined = templateStrings.join('??');
+      expect(joined).toContain('SELECT');
+      expect(joined).toContain('FROM');
+      expect(joined).toContain('WHERE');
+      expect(joined).toContain('run_id');
+      expect(joined).toContain('IN');
+
+      // Only runId is checked among the interpolated values; the table identifier and the testIds list are asserted via mockSqlIdentifier in the two tests that follow
+      const interpolatedValues = mockSqlTagged.mock.calls[0].slice(1);
+      expect(interpolatedValues).toContain(runId);
+    });
+
+    it('should pass the table name through sql identifier helper', async () => {
+      mockSqlTagged.mockResolvedValueOnce([]);
+
+      await getFinishedTests('run-1', ['test-1']);
+
+      expect(mockSqlIdentifier).toHaveBeenCalledWith('TestResult');
+    });
+
+    it('should pass testIds through sql helper for IN clause', async () => {
+      mockSqlTagged.mockResolvedValueOnce([]);
+      const testIds = ['t-1', 't-2', 't-3'];
+
+      await getFinishedTests('run-1', testIds);
+
+      expect(mockSqlIdentifier).toHaveBeenCalledWith(testIds);
+    });
+  });
+
+  describe('getFinishedAsserts', () => {
+    it('should query AssertResult table with run_id and test ids', async () => {
+      mockSqlTagged.mockResolvedValueOnce(FAKE_ASSERT_RESULTS);
+
+      const runId = 'run-1';
+      const testIds = ['test-1'];
+      const result = await getFinishedAsserts(runId, testIds);
+
+      expect(result).toEqual(FAKE_ASSERT_RESULTS);
+      expect(mockSqlTagged).toHaveBeenCalledTimes(1);
+
+      const templateStrings = mockSqlTagged.mock.calls[0][0];
+      const joined = templateStrings.join('??');
+      expect(joined).toContain('SELECT');
+      expect(joined).toContain('FROM');
+      expect(joined).toContain('WHERE');
+      expect(joined).toContain('run_id');
+      expect(joined).toContain('test_id');
+      expect(joined).toContain('IN');
+    });
+
+    it('should pass the table name through sql identifier helper', async () => {
+      mockSqlTagged.mockResolvedValueOnce([]);
+
+      await getFinishedAsserts('run-1', ['test-1']);
+
+      expect(mockSqlIdentifier).toHaveBeenCalledWith('AssertResult');
+    });
+
+    it('should pass testIds through sql helper for IN clause', async () => {
+      mockSqlTagged.mockResolvedValueOnce([]);
+      const testIds = ['t-1', 't-2'];
+
+      await getFinishedAsserts('run-1', testIds);
+
+      expect(mockSqlIdentifier).toHaveBeenCalledWith(testIds);
+    });
+
+    it('should return empty array when no results', async () => {
+      mockSqlTagged.mockResolvedValueOnce([]);
+
+      const result = await getFinishedAsserts('run-x', ['t-none']);
+
+      expect(result).toEqual([]);
+    });
+  });
+});
diff --git a/tests/index.test.ts b/tests/index.test.ts
new file mode 100644
index 0000000..d69a5ba
--- /dev/null
+++ b/tests/index.test.ts
@@ -0,0 +1,486 @@
+import type { ITestResult, TReport, IEpistemicReport, IAssertResult } from '../src/types';
+
+let capturedAction: (suite?: string) => Promise<void>; // assigned by the mocked .action() once src/index is loaded
+// Chainable stand-in for commander's Command: builder methods return `this`, action() also captures the handler.
+const mockCommand: Record<string, jest.Mock> = {
+  name: jest.fn().mockReturnThis(),
+  version: jest.fn().mockReturnThis(),
+  description: jest.fn().mockReturnThis(),
+  command: jest.fn().mockReturnThis(),
+  argument: jest.fn().mockReturnThis(),
+  action: jest.fn((fn: any) => {
+    capturedAction = fn;
+    return mockCommand;
+  }),
+  parse: jest.fn(),
+};
+
+jest.mock('commander', () => ({
+  Command: jest.fn(() => mockCommand),
+}));
+// @clack/prompts stand-ins; the factory below forwards each export to one of these module-level mocks at call time.
+const mockIntro = jest.fn();
+const mockText = jest.fn();
+const mockIsCancel = jest.fn<boolean, [unknown]>(() => false);
+const mockSpinner = jest.fn(() => ({ start: jest.fn(), stop: jest.fn() }));
+const mockCancel = jest.fn();
+const mockOutro = jest.fn();
+
+jest.mock('@clack/prompts', () => ({
+  intro: (...args: any[]) => mockIntro(...args),
+  text: (...args: any[]) => mockText(...args),
+  isCancel: (value: unknown) => mockIsCancel(value),
+  spinner: () => mockSpinner(),
+  cancel: (...args: any[]) => mockCancel(...args),
+  outro: (...args: any[]) => mockOutro(...args),
+}));
+// picocolors stand-in: every color helper returns its input unchanged, so logged text can be matched verbatim.
+jest.mock('picocolors', () => ({
+  __esModule: true,
+  default: {
+    bgCyan: (s: string) => s,
+    black: (s: string) => s,
+    yellow: (s: string) => s,
+    red: (s: string) => s,
+    green: (s: string) => s,
+    blue: (s: string) => s,
+    cyan: (s: string) => s,
+    magenta: (s: string) => s,
+    bold: (s: string) => s,
+  },
+}));
+// Parser, filesystem and uuid mocks return fixed values; the HTTP and observe mocks are configured per test.
+const mockParsePromptfoo = jest.fn<any, [string]>(() => [{ task: 'parsed' }]);
+jest.mock('@eva-llm/eva-parser', () => ({
+  parsePromptfoo: (content: string) => mockParsePromptfoo(content),
+}));
+
+const mockReadFileSync = jest.fn<string, [string, string]>(() => 'file content');
+jest.mock('node:fs', () => ({
+  readFileSync: (path: string, encoding: string) => mockReadFileSync(path, encoding),
+}));
+
+const mockUuidv7 = jest.fn(() => 'mock-uuid-v7');
+jest.mock('uuidv7', () => ({
+  uuidv7: () => mockUuidv7(),
+}));
+
+const mockRequest = jest.fn();
+jest.mock('undici', () => ({
+  request: (...args: any[]) => mockRequest(...args),
+}));
+
+const mockObserve = jest.fn();
+jest.mock('../src/utils', () => ({
+  observe: (...args: any[]) => mockObserve(...args),
+}));
+
+// Load the module — registers the commander action via the mock
+require('../src/index');
+
+/** Builds a TReport describing two passing tests; `overrides` replaces individual fields. */
+const makeReport = (overrides: Partial<TReport> = {}): TReport => {
+  const base: TReport = {
+    testsAmount: 2, passedTestsAmount: 2,
+    failedTests: [], epistemicTests: [], missedTestsAmount: 0,
+  };
+  return { ...base, ...overrides };
+};
+
+/** Builds an ITestResult that is failing by default; `overrides` replaces individual fields. */
+const makeTestResult = (overrides: Partial<ITestResult> = {}): ITestResult => {
+  const base: ITestResult = {
+    id: 'test-1',
+    run_id: 'run-1',
+    provider: 'openai', model: 'gpt-4',
+    prompt: 'test prompt',
+    output: 'test output',
+    passed: false,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    assert_started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 100, assert_diff_ms: 50, output_diff_ms: 50,
+  };
+  return { ...base, ...overrides };
+};
+
+/** Builds an IAssertResult that is failing by default; `overrides` replaces individual fields. */
+const makeAssertResult = (overrides: Partial<IAssertResult> = {}): IAssertResult => {
+  const base: IAssertResult = {
+    id: 'assert-1', test_id: 'test-1', run_id: 'run-1',
+    name: 'test-assert',
+    criteria: 'test criteria',
+    passed: false, score: 0,
+    reason: 'it failed',
+    threshold: 0.5,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 50,
+  };
+  return { ...base, ...overrides };
+};
+
+describe('eva-cli index module', () => {
+  let consoleSpy: jest.SpiedFunction<typeof console.log>;
+  let exitSpy: jest.SpiedFunction<typeof process.exit>;
+  // Fresh spies per test: console.log is silenced and recorded; process.exit throws `process.exit(<code>)` instead of exiting.
+  beforeEach(() => {
+    jest.clearAllMocks();
+    consoleSpy = jest.spyOn(console, 'log').mockImplementation();
+    exitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => {
+      throw new Error(`process.exit(${code})`);
+    }) as any);
+  });
+  // Put the real console.log and process.exit back after each test.
+  afterEach(() => {
+    consoleSpy.mockRestore();
+    exitSpy.mockRestore();
+  });
+
+  describe('run command', () => {
+    // Happy-path defaults: the submission request succeeds with two test ids
+    // and observe() resolves to `report`.
+    const setupMocks = (report: TReport = makeReport()) => {
+      const body = {
+        json: async () => ({ test_ids: ['tid-1', 'tid-2'] }),
+        text: async () => '',
+      };
+      mockObserve.mockResolvedValue(report);
+      mockRequest.mockResolvedValue({ statusCode: 200, body });
+    };
+
+    it('should read and parse the suite file', async () => {
+      setupMocks();
+
+      await expect(capturedAction('my-suite.yaml')).rejects.toThrow('process.exit');
+
+      expect(mockReadFileSync).toHaveBeenCalledWith('my-suite.yaml', 'utf-8');
+      expect(mockParsePromptfoo).toHaveBeenCalledWith('file content');
+    });
+
+    it('should submit parsed tasks to the eva-run cluster', async () => {
+      setupMocks();
+
+      await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit');
+
+      expect(mockRequest).toHaveBeenCalledWith(
+        'http://localhost:3000/eval',
+        expect.objectContaining({
+          method: 'POST',
+          headers: { 'content-type': 'application/json' },
+          body: JSON.stringify([{ run_id: 'mock-uuid-v7', task: 'parsed' }]),
+          bodyTimeout: 0,
+          headersTimeout: 0,
+        }),
+      );
+    });
+
+    it('should observe test results with run_id and test_ids', async () => {
+      setupMocks();
+
+      await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit');
+
+      expect(mockObserve).toHaveBeenCalledWith('mock-uuid-v7', ['tid-1', 'tid-2']);
+    });
+
+    it('should prompt for suite path when not provided as argument', async () => {
+      setupMocks();
+      mockText.mockResolvedValue('interactive-suite.yaml');
+
+      await expect(capturedAction(undefined)).rejects.toThrow('process.exit');
+
+      expect(mockText).toHaveBeenCalledWith(
+        expect.objectContaining({ message: 'Provide path to the test suite:' }),
+      );
+      expect(mockReadFileSync).toHaveBeenCalledWith('interactive-suite.yaml', 'utf-8');
+    });
+
+    it('should validate that prompt input is not empty', async () => {
+      setupMocks();
+      mockText.mockResolvedValue('suite.yaml');
+
+      await expect(capturedAction(undefined)).rejects.toThrow('process.exit');
+
+      const textConfig = mockText.mock.calls[0][0];
+      expect(textConfig.validate('')).toBe('Please enter a path');
+      expect(textConfig.validate('some-path')).toBeUndefined();
+    });
+
+    it('should exit gracefully when user cancels the prompt', async () => {
+      mockIsCancel.mockReturnValueOnce(true);
+      mockText.mockResolvedValue(Symbol('cancel'));
+
+      await expect(capturedAction(undefined)).rejects.toThrow('process.exit(0)');
+
+      expect(mockCancel).toHaveBeenCalledWith('Operation cancelled.');
+      expect(mockRequest).not.toHaveBeenCalled();
+    });
+
+    it('should throw on non-200 API response', async () => {
+      mockRequest.mockResolvedValue({
+        statusCode: 500,
+        body: {
+          text: () => Promise.resolve('Internal Server Error'),
+        },
+      });
+
+      await expect(capturedAction('suite.yaml')).rejects.toThrow(
+        'Server responded with 500: Internal Server Error',
+      );
+    });
+
+    it('should call outro and exit after printing report', async () => {
+      setupMocks();
+
+      await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit');
+
+      expect(mockOutro).toHaveBeenCalled();
+      expect(exitSpy).toHaveBeenCalledWith(0);
+    });
+
+    it('should log submission and test count messages', async () => {
+      setupMocks();
+
+      await expect(capturedAction('suite.yaml')).rejects.toThrow('process.exit');
+
+      const logs = consoleSpy.mock.calls.map((c) => c[0]);
+      expect(logs).toContain('Submitting to eva-run cluster (localhost:3000)...');
+      expect(logs).toContain('2 test(s) are started...');
+    });
+  });
+
+  describe('printReport', () => {
+    // Runs the captured action against a successful submission whose observed outcome is
+    // `report`; the action ends in the mocked process.exit, hence the expected rejection.
+    const setupAndRun = async (report: TReport) => {
+      const body = { json: async () => ({ test_ids: ['t1'] }) };
+      mockObserve.mockResolvedValue(report);
+      mockRequest.mockResolvedValue({ statusCode: 200, body });
+      const run = capturedAction('suite.yaml');
+      await expect(run).rejects.toThrow('process.exit');
+    };
+
+    it('should print summary for all-passing report', async () => {
+      await setupAndRun(
+        makeReport({
+          testsAmount: 3,
+          passedTestsAmount: 3,
+          failedTests: [],
+          epistemicTests: [],
+          missedTestsAmount: 0,
+        }),
+      );
+
+      const logs = consoleSpy.mock.calls.map((c) => c[0]).join('\n');
+      expect(logs).toContain('Passed tests: 3');
+      expect(logs).toContain('Failed tests: 0');
+      expect(logs).toContain('Total tests: 3');
+      expect(logs).toContain('Missed tests: 0');
+      expect(logs).not.toContain('Failed test details:');
+    });
+
+    it('should print failed test details with asserts', async () => {
+      const failedTest = makeTestResult({
+        id: 'ft-1',
+        provider: 'anthropic',
+        model: 'claude-3',
+        prompt: 'why?',
+        output: 'because',
+        passed: false,
+        asserts: [
+          makeAssertResult({
+            criteria: 'must be polite',
+            reason: 'was rude',
+            passed: false,
+            score: 0.2,
+            threshold: 0.8,
+          }),
+        ],
+      });
+
+      await setupAndRun(
+        makeReport({
+          testsAmount: 1,
+          passedTestsAmount: 0,
+          failedTests: [failedTest],
+        }),
+      );
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('Failed test details:');
+      expect(logs).toContain('why?');
+      expect(logs).toContain('because');
+      expect(logs).toContain('must be polite');
+      expect(logs).toContain('was rude');
+      expect(logs).toContain('Failed tests: 1');
+    });
+
+    it('should print epistemic test details', async () => {
+      const epistemicTest: IEpistemicReport = {
+        ...makeTestResult({ id: 'et-1', prompt: 'epistemic q', output: 'answer' }),
+        honesty: 0.875,
+        deviation: 0.125,
+      };
+
+      await setupAndRun(
+        makeReport({
+          testsAmount: 1,
+          passedTestsAmount: 1,
+          failedTests: [],
+          epistemicTests: [epistemicTest],
+        }),
+      );
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('Epistemic test details:');
+      expect(logs).toContain('epistemic q');
+      expect(logs).toContain('answer');
+      expect(logs).toContain('0.875');
+      expect(logs).toContain('0.125');
+      expect(logs).toContain('Epistemic tests: 1');
+    });
+
+    it('should print missed tests count', async () => {
+      await setupAndRun(
+        makeReport({
+          testsAmount: 5,
+          passedTestsAmount: 3,
+          missedTestsAmount: 2,
+        }),
+      );
+
+      const logs = consoleSpy.mock.calls.map((c) => c[0]).join('\n');
+      expect(logs).toContain('Missed tests: 2');
+    });
+
+    it('should display temperature in model info for failed tests', async () => {
+      const test = makeTestResult({
+        provider: 'openai',
+        model: 'gpt-4',
+        metadata: { temperature: 0.7 },
+        asserts: [makeAssertResult()],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('openai | gpt-4 | T=0.7');
+    });
+
+    it('should display topP in model info for failed tests', async () => {
+      const test = makeTestResult({
+        provider: 'openai',
+        model: 'gpt-4',
+        metadata: { topP: 0.9 },
+        asserts: [makeAssertResult()],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('openai | gpt-4 | topP=0.9');
+    });
+
+    it('should display topK in model info for failed tests', async () => {
+      const test = makeTestResult({
+        provider: 'openai',
+        model: 'gpt-4',
+        metadata: { topK: 50 },
+        asserts: [makeAssertResult()],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('openai | gpt-4 | topK=50');
+    });
+
+    it('should display provider and model without extras when no relevant metadata', async () => {
+      const test = makeTestResult({
+        provider: 'anthropic',
+        model: 'claude-3',
+        metadata: null,
+        asserts: [makeAssertResult()],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('anthropic | claude-3');
+      expect(logs).not.toMatch(/T=|topP=|topK=/);
+    });
+
+    it('should print must_fail metadata in assert summary', async () => {
+      const test = makeTestResult({
+        asserts: [
+          makeAssertResult({
+            passed: true,
+            score: 0.9,
+            threshold: 0.5,
+            metadata: { must_fail: true },
+          }),
+        ],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('must_fail: true');
+    });
+
+    it('should omit must_fail when assert metadata is null', async () => {
+      const test = makeTestResult({
+        asserts: [makeAssertResult({ metadata: null })],
+      });
+
+      await setupAndRun(makeReport({ testsAmount: 1, passedTestsAmount: 0, failedTests: [test] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).not.toContain('must_fail');
+    });
+
+    it('should display model info for epistemic tests', async () => {
+      const epistemicTest: IEpistemicReport = {
+        ...makeTestResult({
+          provider: 'google',
+          model: 'gemini-pro',
+          metadata: { temperature: 0.3 },
+        }),
+        honesty: 1,
+        deviation: 0,
+      };
+
+      await setupAndRun(makeReport({ epistemicTests: [epistemicTest] }));
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('google | gemini-pro | T=0.3');
+    });
+
+    it('should handle multiple failed tests', async () => {
+      const tests = [
+        makeTestResult({
+          id: 'ft-1',
+          prompt: 'prompt A',
+          asserts: [makeAssertResult({ id: 'a1', criteria: 'criterion A' })],
+        }),
+        makeTestResult({
+          id: 'ft-2',
+          prompt: 'prompt B',
+          asserts: [makeAssertResult({ id: 'a2', criteria: 'criterion B' })],
+        }),
+      ];
+
+      await setupAndRun(
+        makeReport({ testsAmount: 2, passedTestsAmount: 0, failedTests: tests }),
+      );
+
+      const logs = consoleSpy.mock.calls.map((c) => String(c.join(' '))).join('\n');
+      expect(logs).toContain('prompt A');
+      expect(logs).toContain('prompt B');
+      expect(logs).toContain('criterion A');
+      expect(logs).toContain('criterion B');
+    });
+  });
+});
diff --git a/tests/utils.test.ts b/tests/utils.test.ts
new file mode 100644
index 0000000..18d6ce0
--- /dev/null
+++ b/tests/utils.test.ts
@@ -0,0 +1,355 @@
+import { xnor, observe } from '../src/utils';
+import { getFinishedTests, getFinishedAsserts } from '../src/db';
+import type { ITestResult, IAssertResult } from '../src/types';
+
+jest.mock('picocolors', () => ({ // only `yellow` is provided; it returns its input unchanged
+  __esModule: true,
+  default: { yellow: jest.fn((s: string) => s) },
+}));
+jest.mock('../src/db', () => ({ // both db readers become bare jest.fn()s configured per test
+  getFinishedTests: jest.fn(),
+  getFinishedAsserts: jest.fn(),
+}));
+// The db imports at the top of the file resolve to the jest.fn()s above; cast them so the mock helpers type-check.
+const mockedGetFinishedTests = getFinishedTests as jest.Mock;
+const mockedGetFinishedAsserts = getFinishedAsserts as jest.Mock;
+
+/** Builds an ITestResult that is passing by default; `overrides` replaces individual fields. */
+const makeTestResult = (overrides: Partial<ITestResult> = {}): ITestResult => {
+  const base: ITestResult = {
+    id: 'test-1',
+    run_id: 'run-1',
+    provider: 'openai', model: 'gpt-4',
+    prompt: 'test prompt',
+    output: 'test output',
+    passed: true,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    assert_started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 100, assert_diff_ms: 50, output_diff_ms: 50,
+  };
+  return { ...base, ...overrides };
+};
+
+/** Builds an IAssertResult that is passing by default; `overrides` replaces individual fields. */
+const makeAssertResult = (overrides: Partial<IAssertResult> = {}): IAssertResult => {
+  const base: IAssertResult = {
+    id: 'assert-1', test_id: 'test-1', run_id: 'run-1',
+    name: 'test-assert',
+    criteria: 'test criteria',
+    passed: true, score: 1,
+    reason: 'passed',
+    threshold: 0.5,
+    metadata: null,
+    started_at: new Date('2025-01-01'),
+    finished_at: new Date('2025-01-01'),
+    diff_ms: 50,
+  };
+  return { ...base, ...overrides };
+};
+
+describe('utils module', () => {
+  describe('xnor', () => { // covers the full truth table: true only when both inputs are equal
+    it('should return true when both are true', () => {
+      expect(xnor(true, true)).toBe(true);
+    });
+
+    it('should return true when both are false', () => {
+      expect(xnor(false, false)).toBe(true);
+    });
+
+    it('should return false when first is true and second is false', () => {
+      expect(xnor(true, false)).toBe(false);
+    });
+
+    it('should return false when first is false and second is true', () => {
+      expect(xnor(false, true)).toBe(false);
+    });
+  });
+
+  describe('observe', () => {
+    let consoleSpy: jest.SpiedFunction<typeof console.log>;
+
+    beforeEach(() => {
+      jest.useFakeTimers();
+      jest.clearAllMocks();
+      consoleSpy = jest.spyOn(console, 'log').mockImplementation();
+    });
+
+    afterEach(() => {
+      jest.useRealTimers();
+      consoleSpy.mockRestore();
+    });
+
+    it('should return report when all tests complete in one batch', async () => {
+      const tests = [
+        makeTestResult({ id: 'test-1', passed: true }),
+        makeTestResult({ id: 'test-2', passed: true }),
+      ];
+
+      mockedGetFinishedTests.mockResolvedValueOnce(tests);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([]);
+
+      const promise = observe('run-1', ['test-1', 'test-2']);
+      await jest.advanceTimersByTimeAsync(200);
+      const result = await promise;
+
+      expect(result).toEqual({
+        testsAmount: 2,
+        passedTestsAmount: 2,
+        failedTests: [],
+        epistemicTests: [],
+        missedTestsAmount: 0,
+      });
+    });
+
+    it('should handle tests completing in multiple batches', async () => {
+      mockedGetFinishedTests
+        .mockResolvedValueOnce([makeTestResult({ id: 'test-1' })])
+        .mockResolvedValueOnce([makeTestResult({ id: 'test-2' })]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([]);
+
+      const promise = observe('run-1', ['test-1', 'test-2']);
+      await jest.advanceTimersByTimeAsync(200);
+      await jest.advanceTimersByTimeAsync(200);
+      const result = await promise;
+
+      expect(result.testsAmount).toBe(2);
+      expect(result.passedTestsAmount).toBe(2);
+      expect(result.failedTests).toEqual([]);
+      expect(result.missedTestsAmount).toBe(0);
+      expect(consoleSpy).toHaveBeenCalledTimes(2);
+    });
+
+    it('should report failed tests with their asserts attached', async () => {
+      const failedTest = makeTestResult({ id: 'test-1', passed: false });
+      const passingTest = makeTestResult({ id: 'test-2', passed: true });
+      const failedAssert = makeAssertResult({
+        id: 'assert-1',
+        test_id: 'test-1',
+        passed: false,
+        score: 0,
+        reason: 'failed criteria',
+      });
+
+      mockedGetFinishedTests.mockResolvedValueOnce([failedTest, passingTest]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([failedAssert]);
+
+      const promise = observe('run-1', ['test-1', 'test-2']);
+      await jest.advanceTimersByTimeAsync(200);
+      const result = await promise;
+
+      expect(result.testsAmount).toBe(2);
+      expect(result.passedTestsAmount).toBe(1);
+      expect(result.failedTests).toHaveLength(1);
+      expect(result.failedTests[0].id).toBe('test-1');
+      expect(result.failedTests[0].asserts).toEqual([failedAssert]);
+    });
+
+    it('should calculate epistemic test metrics', async () => {
+      const test = makeTestResult({ id: 'test-1', passed: true });
+
+      const positivePassedAssert = makeAssertResult({
+        id: 'assert-pos-pass',
+        test_id: 'test-1',
+        passed: true,
+        metadata: null,
+      });
+      const positiveFailedAssert = makeAssertResult({
+        id: 'assert-pos-fail',
+        test_id: 'test-1',
+        passed: false,
+        metadata: null,
+      });
+      const negativeFailedAssert = makeAssertResult({
+        id: 'assert-neg-fail',
+        test_id: 'test-1',
+        passed: false,
+        metadata: { must_fail: true },
+      });
+      const negativePassedAssert = makeAssertResult({
+        id: 'assert-neg-pass',
+        test_id: 'test-1',
+        passed: true,
+        metadata: { must_fail: true },
+      });
+
+      mockedGetFinishedTests.mockResolvedValueOnce([test]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([
+        positivePassedAssert,
+        positiveFailedAssert,
+        negativeFailedAssert,
+        negativePassedAssert,
+      ]);
+
+      const promise = observe('run-1', ['test-1']);
+      await jest.advanceTimersByTimeAsync(200);
+      const result = await promise;
+
+      expect(result.epistemicTests).toHaveLength(1);
+      // positiveAsserts: 2, positivePassedAsserts: 1 → ratio 0.5
+      // negativeAsserts: 2, negativeFailedAsserts: 1 → ratio 0.5
+      // honesty = |1 - 0.5 - 0.5| = 0, deviation = 1
+      expect(result.epistemicTests[0].honesty).toBe(0);
+      expect(result.epistemicTests[0].deviation).toBe(1);
+    });
+
+    it('should calculate perfect epistemic scores', async () => {
+      // Fixture: one regular assert that passed, one must_fail assert that failed.
+      const finishedTest = makeTestResult({ id: 'test-1', passed: true });
+      const regularAssert = makeAssertResult({
+        id: 'assert-pos',
+        test_id: 'test-1',
+        passed: true,
+        metadata: null,
+      });
+      const mustFailAssert = makeAssertResult({
+        id: 'assert-neg',
+        test_id: 'test-1',
+        passed: false,
+        metadata: { must_fail: true },
+      });
+
+      mockedGetFinishedTests.mockResolvedValueOnce([finishedTest]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([regularAssert, mustFailAssert]);
+
+      const pending = observe('run-1', ['test-1']);
+      await jest.advanceTimersByTimeAsync(200);
+      const { epistemicTests } = await pending;
+
+      // Ratios are 1/1 and 1/1, so honesty = |1 - 1 - 1| = 1 and deviation = 0.
+      const [entry] = epistemicTests;
+      expect(entry.honesty).toBe(1);
+      expect(entry.deviation).toBe(0);
+    });
+
+    it('should skip epistemic entries when test is not found in results', async () => {
+      const knownTest = makeTestResult({ id: 'test-1', passed: true });
+
+      // This assert points at a test id that is absent from the finished tests.
+      const orphanAssert = makeAssertResult({
+        id: 'assert-1',
+        test_id: 'test-nonexistent',
+        metadata: { must_fail: true },
+      });
+      mockedGetFinishedTests.mockResolvedValueOnce([knownTest]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([orphanAssert]);
+
+      const pending = observe('run-1', ['test-1']);
+      await jest.advanceTimersByTimeAsync(200);
+      const { epistemicTests } = await pending;
+
+      expect(epistemicTests).toEqual([]);
+    });
+
+    it('should correctly filter failed asserts using xnor logic', async () => {
+      const failedTest = makeTestResult({ id: 'test-1', passed: false });
+
+      // Unexpected outcome (kept): a regular assert that failed.
+      const plainFailure = makeAssertResult({
+        id: 'a1',
+        test_id: 'test-1',
+        passed: false,
+        metadata: null,
+      });
+      // Expected outcome (dropped): a regular assert that passed.
+      const plainSuccess = makeAssertResult({
+        id: 'a2',
+        test_id: 'test-1',
+        passed: true,
+        metadata: null,
+      });
+      // Unexpected outcome (kept): a must_fail assert that passed.
+      const inverseSuccess = makeAssertResult({
+        id: 'a3',
+        test_id: 'test-1',
+        passed: true,
+        metadata: { must_fail: true },
+      });
+      // Expected outcome (dropped): a must_fail assert that failed.
+      const inverseFailure = makeAssertResult({
+        id: 'a4',
+        test_id: 'test-1',
+        passed: false,
+        metadata: { must_fail: true },
+      });
+
+      mockedGetFinishedTests.mockResolvedValueOnce([failedTest]);
+      const allAsserts = [plainFailure, plainSuccess, inverseSuccess, inverseFailure];
+      mockedGetFinishedAsserts.mockResolvedValueOnce(allAsserts);
+
+      const pending = observe('run-1', ['test-1']);
+      await jest.advanceTimersByTimeAsync(200);
+      const { failedTests } = await pending;
+
+      expect(failedTests).toHaveLength(1);
+      // Only the two unexpected outcomes should be attached to the failed test.
+      const keptIds = failedTests[0].asserts!.map(({ id }) => id);
+      expect(keptIds).toHaveLength(2);
+      expect(keptIds.sort()).toEqual(['a1', 'a3']);
+    });
+
+    it('should handle idle iterations before tests complete', async () => {
+      // The first two getFinishedTests calls resolve empty; the third has the test.
+      mockedGetFinishedTests.mockResolvedValueOnce([]).mockResolvedValueOnce([]);
+      const lateTest = makeTestResult({ id: 'test-1' });
+      mockedGetFinishedTests.mockResolvedValueOnce([lateTest]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([]);
+
+      const pending = observe('run-1', ['test-1']);
+      for (let step = 0; step < 3; step += 1) {
+        await jest.advanceTimersByTimeAsync(200);
+      }
+      const { testsAmount, passedTestsAmount, missedTestsAmount } = await pending;
+
+      expect(testsAmount).toBe(1);
+      expect(passedTestsAmount).toBe(1);
+      expect(missedTestsAmount).toBe(0);
+    });
+
+    it('should log progress for each completed batch', async () => {
+      const firstBatch = [makeTestResult({ id: 'test-1' })];
+      const secondBatch = [makeTestResult({ id: 'test-2' })];
+      mockedGetFinishedTests.mockResolvedValueOnce(firstBatch).mockResolvedValueOnce(secondBatch);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([]);
+
+      const pending = observe('run-1', ['test-1', 'test-2']);
+      await jest.advanceTimersByTimeAsync(200);
+      await jest.advanceTimersByTimeAsync(200);
+      await pending;
+      // Two batches were delivered, so consoleSpy must have been called twice.
+      expect(consoleSpy).toHaveBeenCalledTimes(2);
+    });
+
+    it('should pass runId and testIds to db functions', async () => {
+      const finishedTest = makeTestResult({ id: 'test-1' });
+      mockedGetFinishedTests.mockResolvedValueOnce([finishedTest]);
+      mockedGetFinishedAsserts.mockResolvedValueOnce([]);
+
+      const pending = observe('run-123', ['test-1']);
+      await jest.advanceTimersByTimeAsync(200);
+      await pending;
+      // Both mocked db accessors must receive the same run id and test ids.
+      for (const dbMock of [mockedGetFinishedTests, mockedGetFinishedAsserts]) {
+        expect(dbMock).toHaveBeenCalledWith('run-123', ['test-1']);
+      }
+    });
+
+    // NOTE: Keep this test last. It leaves the module-level idleCounter at
+    // MAX_IDLE_ITERATIONS, and that value persists into any test run after it.
+    it('should break on idle timeout and report missed tests', async () => {
+      // Neither db mock ever reports a finished test or assert.
+      mockedGetFinishedTests.mockResolvedValue([]);
+      mockedGetFinishedAsserts.mockResolvedValue([]);
+      const pending = observe('run-1', ['test-1']);
+      await jest.advanceTimersByTimeAsync(60_000);
+      const summary = await pending;
+
+      expect(summary.testsAmount).toBe(1);
+      expect(summary.passedTestsAmount).toBe(1);
+      expect(summary.failedTests).toEqual([]);
+      expect(summary.missedTestsAmount).toBe(1);
+    });
+  });
+});