From a0a029e5d3298f9a3c11be14ef0f7801e23b70f8 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 20:35:38 -0400 Subject: [PATCH 1/3] feat(code_gen): add per-language subset metrics for livecodebench-x Signed-off-by: Rod Boev --- resources_servers/code_gen/app.py | 6 ++++++ resources_servers/code_gen/tests/test_app.py | 21 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/resources_servers/code_gen/app.py b/resources_servers/code_gen/app.py index d6787203db..b5d372a29f 100644 --- a/resources_servers/code_gen/app.py +++ b/resources_servers/code_gen/app.py @@ -74,6 +74,7 @@ class CompCodingVerifyResponse(BaseVerifyResponse): unit_tests_time_taken: Optional[float] = None reasoning_format_violation_rate: float = 0.0 difficulty: Optional[str] = None + target_language: Optional[str] = None # ---------------------------- @@ -124,6 +125,7 @@ def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]: ) add_avg_sample_std_dev(metrics, all_score_dicts, score_names, max_k) metrics.update(compute_subset_metrics(tasks, "difficulty", self._code_score_fn, "extracted_model_code")) + metrics.update(compute_subset_metrics(tasks, "target_language", self._code_score_fn, "extracted_model_code")) return metrics def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]: @@ -147,6 +149,7 @@ def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]: async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyResponse: model_out = body.response.output_text difficulty = (body.verifier_metadata or {}).get("difficulty") + target_language = (body.verifier_metadata or {}).get("target_language") if not model_out or not model_out.strip(): # A response existed but had no usable text -> model failure @@ -154,6 +157,7 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons **body.model_dump(), reward=0.0, difficulty=difficulty, + target_language=target_language, ) tests = UnitTests.model_validate(body.verifier_metadata["unit_tests"]) @@ -166,6 +170,7 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons reward=0.0, extracted_model_output=model_out, difficulty=difficulty, + target_language=target_language, ) # 4) run (no sandbox) @@ -224,6 +229,7 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons unit_tests_time_taken=unit_tests_time_taken, reasoning_format_violation_rate=1.0 if has_violation else 0.0, difficulty=difficulty, + target_language=target_language, ) diff --git a/resources_servers/code_gen/tests/test_app.py b/resources_servers/code_gen/tests/test_app.py index e39c287eda..5c601911bc 100644 --- a/resources_servers/code_gen/tests/test_app.py +++ b/resources_servers/code_gen/tests/test_app.py @@ -146,6 +146,10 @@ def test_verify_missing_response_validation_error(self) -> None: verifier_metadata={"unit_tests": {"inputs": ["1\n"], "outputs": ["1"]}}, ) + def test_verify_target_language_field(self) -> None: + resp = CompCodingVerifyResponse(reward=0.0, target_language="ja") + assert resp.target_language == "ja" + async def test_verify_no_code_block(self, code_gen_resources_server_client: TestClient) -> None: """Test when response contains no code block - should extract raw text""" response = NeMoGymResponse( @@ -327,6 +331,23 @@ def test_no_answer_tracked(self) -> None: assert "pass@1[avg-of-2]/no_answer" in m assert m["pass@1[avg-of-2]/no_answer"] == pytest.approx(50.0) + def test_produces_per_language_subsets(self): + server = _make_server() + tasks = [ + [ + {"reward": 1.0, "extracted_model_code": "print(1)", "target_language": "de"}, + {"reward": 1.0, "extracted_model_code": "print(1)", "target_language": "de"}, + ], + [ + {"reward": 0.0, "extracted_model_code": "print(2)", "target_language": "fr"}, + {"reward": 0.0, "extracted_model_code": "print(2)", "target_language": "fr"}, + ], + ] + m = server.compute_metrics(tasks) + assert "de/pass@1/accuracy" in m + assert "fr/pass@1/accuracy" in m + assert m["de/pass@1/accuracy"] > m["fr/pass@1/accuracy"] + class TestGetKeyMetrics: @pytest.mark.asyncio From bbeca8d6b1b29bf12fb347e7111ee92fe6529151 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 20:50:55 -0400 Subject: [PATCH 2/3] fix(code_gen): declare target_language on verify request and extract correctly Signed-off-by: Rod Boev --- resources_servers/code_gen/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources_servers/code_gen/app.py b/resources_servers/code_gen/app.py index b5d372a29f..2001f6ecab 100644 --- a/resources_servers/code_gen/app.py +++ b/resources_servers/code_gen/app.py @@ -64,6 +64,7 @@ class CompCodingRunRequest(BaseRunRequest): class CompCodingVerifyRequest(CompCodingRunRequest, BaseVerifyRequest): verifier_metadata: Optional[Dict[str, Any]] = None + target_language: Optional[str] = None class CompCodingVerifyResponse(BaseVerifyResponse): @@ -149,7 +150,7 @@ def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]: async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyResponse: model_out = body.response.output_text difficulty = (body.verifier_metadata or {}).get("difficulty") - target_language = (body.verifier_metadata or {}).get("target_language") + target_language = body.target_language if not model_out or not model_out.strip(): # A response existed but had no usable text -> model failure From cd3ef8ed0addffcef66d6e5b762425ce5f354a16 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 21:00:04 -0400 Subject: [PATCH 3/3] fix(code_gen): drop redundant target_language kwarg shadowed by model_dump Signed-off-by: Rod Boev --- resources_servers/code_gen/app.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/resources_servers/code_gen/app.py b/resources_servers/code_gen/app.py index 2001f6ecab..317da60e30 100644 --- a/resources_servers/code_gen/app.py +++ b/resources_servers/code_gen/app.py @@ -150,7 +150,6 @@ def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]: async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyResponse: model_out = body.response.output_text difficulty = (body.verifier_metadata or {}).get("difficulty") - target_language = body.target_language if not model_out or not model_out.strip(): # A response existed but had no usable text -> model failure @@ -158,7 +157,6 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons **body.model_dump(), reward=0.0, difficulty=difficulty, - target_language=target_language, ) tests = UnitTests.model_validate(body.verifier_metadata["unit_tests"]) @@ -171,7 +169,6 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons reward=0.0, extracted_model_output=model_out, difficulty=difficulty, - target_language=target_language, ) # 4) run (no sandbox) @@ -230,7 +227,6 @@ async def verify(self, body: CompCodingVerifyRequest) -> CompCodingVerifyRespons unit_tests_time_taken=unit_tests_time_taken, reasoning_format_violation_rate=1.0 if has_violation else 0.0, difficulty=difficulty, - target_language=target_language, )