diff --git a/py/autoevals/ragas.py b/py/autoevals/ragas.py
index 9a6d8a56..47261599 100644
--- a/py/autoevals/ragas.py
+++ b/py/autoevals/ragas.py
@@ -983,7 +983,7 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,
 
         statements = (
             await aextract_statements(
-                client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
+                client=self.client, question=input, answer=output, model=self.model, **self.extra_args
             )
         )["statements"]
 
@@ -1003,12 +1003,10 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,
         )
 
     def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwargs):
-        check_required("Faithfulness", input=input, context=context)
+        check_required("Faithfulness", input=input, output=output, context=context)
 
         statements = (
-            extract_statements(
-                client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
-            )
+            extract_statements(client=self.client, question=input, answer=output, model=self.model, **self.extra_args)
         )["statements"]
 
         faithfulness = (
diff --git a/py/autoevals/test_ragas.py b/py/autoevals/test_ragas.py
index 0d0349c0..74db4e57 100644
--- a/py/autoevals/test_ragas.py
+++ b/py/autoevals/test_ragas.py
@@ -6,6 +6,7 @@
 from httpx import Response
 from openai import OpenAI
 
+import autoevals.ragas as ragas_module
 from autoevals import init
 from autoevals.ragas import *
 
@@ -125,6 +126,51 @@ def test_context_relevancy_score_normal_case():
     assert result.score >= 0.0
 
 
+def test_faithfulness_extracts_statements_from_output(monkeypatch):
+    """Regression test for Faithfulness answer routing.
+
+    This verifies that Faithfulness extracts statements from ``output`` (the model
+    answer being evaluated), not from ``expected`` (ground truth). The test uses
+    mismatched ``output``/``expected`` values and mocked helpers that derive
+    statements/verdicts from their inputs so the final score depends on which
+    field was routed into statement extraction.
+    """
+    captured_answer = None
+
+    def fake_extract_statements(question, answer, client=None, **extra_args):
+        nonlocal captured_answer
+        captured_answer = answer
+        statement = answer.strip().rstrip(".")
+        return {"statements": [statement]}
+
+    def fake_extract_faithfulness(context, statements, client=None, **extra_args):
+        faithfulness = []
+        for statement in statements:
+            verdict = int(statement in context)
+            faithfulness.append(
+                {
+                    "statement": statement,
+                    "verdict": verdict,
+                    "reason": "Supported by context" if verdict else "Not found in context",
+                }
+            )
+        return {"faithfulness": faithfulness}
+
+    monkeypatch.setattr(ragas_module, "extract_statements", fake_extract_statements)
+    monkeypatch.setattr(ragas_module, "extract_faithfulness", fake_extract_faithfulness)
+
+    scorer = Faithfulness()
+    score = scorer.eval(
+        input="What is the capital of France?",
+        output="Paris is the capital of France.",
+        expected="Lyon is the capital of France.",
+        context="Paris is the capital of France.",
+    )
+
+    assert score.score == 1
+    assert captured_answer == "Paris is the capital of France."
+
+
 @respx.mock
 def test_answer_correctness_uses_custom_embedding_model():
     """Test that AnswerCorrectness passes embedding_model parameter through to embeddings API."""