diff --git a/resources_servers/code_gen/app.py b/resources_servers/code_gen/app.py index d6787203db..317da60e30 100644 --- a/resources_servers/code_gen/app.py +++ b/resources_servers/code_gen/app.py @@ -64,6 +64,7 @@ class CompCodingRunRequest(BaseRunRequest): class CompCodingVerifyRequest(CompCodingRunRequest, BaseVerifyRequest): verifier_metadata: Optional[Dict[str, Any]] = None + target_language: Optional[str] = None class CompCodingVerifyResponse(BaseVerifyResponse): @@ -74,6 +75,7 @@ class CompCodingVerifyResponse(BaseVerifyResponse): unit_tests_time_taken: Optional[float] = None reasoning_format_violation_rate: float = 0.0 difficulty: Optional[str] = None + target_language: Optional[str] = None # ---------------------------- @@ -124,6 +126,7 @@ def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]: ) add_avg_sample_std_dev(metrics, all_score_dicts, score_names, max_k) metrics.update(compute_subset_metrics(tasks, "difficulty", self._code_score_fn, "extracted_model_code")) + metrics.update(compute_subset_metrics(tasks, "target_language", self._code_score_fn, "extracted_model_code")) return metrics def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]: diff --git a/resources_servers/code_gen/tests/test_app.py b/resources_servers/code_gen/tests/test_app.py index e39c287eda..5c601911bc 100644 --- a/resources_servers/code_gen/tests/test_app.py +++ b/resources_servers/code_gen/tests/test_app.py @@ -146,6 +146,10 @@ def test_verify_missing_response_validation_error(self) -> None: verifier_metadata={"unit_tests": {"inputs": ["1\n"], "outputs": ["1"]}}, ) + def test_verify_target_language_field(self) -> None: + resp = CompCodingVerifyResponse(reward=0.0, target_language="ja") + assert resp.target_language == "ja" + async def test_verify_no_code_block(self, code_gen_resources_server_client: TestClient) -> None: """Test when response contains no code block - should extract raw text""" response = NeMoGymResponse( @@ -327,6 +331,23 @@ def test_no_answer_tracked(self) -> None: assert "pass@1[avg-of-2]/no_answer" in m assert m["pass@1[avg-of-2]/no_answer"] == pytest.approx(50.0) + def test_produces_per_language_subsets(self): + server = _make_server() + tasks = [ + [ + {"reward": 1.0, "extracted_model_code": "print(1)", "target_language": "de"}, + {"reward": 1.0, "extracted_model_code": "print(1)", "target_language": "de"}, + ], + [ + {"reward": 0.0, "extracted_model_code": "print(2)", "target_language": "fr"}, + {"reward": 0.0, "extracted_model_code": "print(2)", "target_language": "fr"}, + ], + ] + m = server.compute_metrics(tasks) + assert "de/pass@1/accuracy" in m + assert "fr/pass@1/accuracy" in m + assert m["de/pass@1/accuracy"] > m["fr/pass@1/accuracy"] + class TestGetKeyMetrics: @pytest.mark.asyncio