Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions py/autoevals/ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,7 +983,7 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,

statements = (
await aextract_statements(
client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
client=self.client, question=input, answer=output, model=self.model, **self.extra_args
)
)["statements"]

Expand All @@ -1003,12 +1003,10 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,
)

def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwargs):
check_required("Faithfulness", input=input, context=context)
check_required("Faithfulness", input=input, output=output, context=context)

statements = (
extract_statements(
client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
)
extract_statements(client=self.client, question=input, answer=output, model=self.model, **self.extra_args)
)["statements"]

faithfulness = (
Expand Down
46 changes: 46 additions & 0 deletions py/autoevals/test_ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from httpx import Response
from openai import OpenAI

import autoevals.ragas as ragas_module
from autoevals import init
from autoevals.ragas import *

Expand Down Expand Up @@ -125,6 +126,51 @@ def test_context_relevancy_score_normal_case():
assert result.score >= 0.0


def test_faithfulness_extracts_statements_from_output(monkeypatch):
    """Check that Faithfulness feeds ``output`` into statement extraction.

    Both helper functions on ``ragas_module`` are replaced with local fakes:

    * ``extract_statements`` records the ``answer`` it receives and returns
      that answer (stripped, without a trailing period) as the only statement.
    * ``extract_faithfulness`` gives a statement verdict ``1`` when it is a
      substring of ``context`` and ``0`` otherwise.

    ``output`` matches the context while ``expected`` does not, so the test
    expects a score of ``1`` and expects the recorded answer to be ``output``.
    """
    # Mutable holder written by the fake below; stays None if it is never called.
    recorded = {"answer": None}

    def fake_extract_statements(question, answer, client=None, **extra_args):
        recorded["answer"] = answer
        return {"statements": [answer.strip().rstrip(".")]}

    def fake_extract_faithfulness(context, statements, client=None, **extra_args):
        def judge(statement):
            # Plain containment check stands in for the real verdict logic.
            supported = statement in context
            return {
                "statement": statement,
                "verdict": int(supported),
                "reason": "Supported by context" if supported else "Not found in context",
            }

        return {"faithfulness": [judge(statement) for statement in statements]}

    replacements = (
        ("extract_statements", fake_extract_statements),
        ("extract_faithfulness", fake_extract_faithfulness),
    )
    for attr_name, fake in replacements:
        monkeypatch.setattr(ragas_module, attr_name, fake)

    model_answer = "Paris is the capital of France."
    score = Faithfulness().eval(
        input="What is the capital of France?",
        output=model_answer,
        expected="Lyon is the capital of France.",
        context="Paris is the capital of France.",
    )

    assert score.score == 1
    assert recorded["answer"] == "Paris is the capital of France."


@respx.mock
def test_answer_correctness_uses_custom_embedding_model():
"""Test that AnswerCorrectness passes embedding_model parameter through to embeddings API."""
Expand Down
Loading