From 180a4857abdeda921c4363850b4e4b7b6b80684a Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 20:35:41 -0400 Subject: [PATCH 1/4] feat(instruction_following): add loose-mode evaluation and compute_metrics Signed-off-by: Rod Boev --- .../instruction_following/app.py | 81 +++++++++++++++++-- .../instruction_following/tests/test_app.py | 43 ++++++++++ 2 files changed, 118 insertions(+), 6 deletions(-) diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py index feaae8f062..02128e8c70 100644 --- a/resources_servers/instruction_following/app.py +++ b/resources_servers/instruction_following/app.py @@ -12,7 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Literal +import re +import string +from typing import Any, Dict, List, Literal from fastapi import FastAPI from verifiable_instructions import instructions_registry @@ -45,9 +47,43 @@ class InstructionFollowingVerifyRequest(InstructionFollowingRunRequest, BaseVeri pass +def _get_loose_perturbations(text: str) -> list: + """Return the 8 IFEval loose-mode perturbations of a response string.""" + stripped = text.strip() + sentences = re.split(r'(?<=[.?!])\s+', stripped) + first_sentence = sentences[0] if sentences else stripped + paragraphs = re.split(r'\n\s*\n', stripped) + first_paragraph = paragraphs[0] if paragraphs else stripped + no_punct = stripped.translate(str.maketrans('', '', string.punctuation)).strip() + star_match = re.search(r'\*(.*?)\*', text, re.DOTALL) + between_stars = star_match.group(1) if star_match else text + return [ + text, + stripped, + text.lower(), + first_sentence, + first_paragraph, + stripped.lower(), + no_punct, + between_stars, + ] + + +def _check_following_loose(instruction, text: str) -> bool: + """Check instruction against native loose API or 8 perturbations.""" + if hasattr(instruction, 'check_following_loose'): + return instruction.check_following_loose(text) + try: + return instruction.check_following(text, mode='loose') + except TypeError: + return any(instruction.check_following(p) for p in _get_loose_perturbations(text)) + + class InstructionFollowingVerifyResponse(BaseVerifyResponse): follow_all_instructions: bool follow_instruction_list: List[bool] + follow_all_instructions_loose: bool + follow_instruction_list_loose: List[bool] kwargs: List instruction_id_list: List prompt: str @@ -102,6 +138,7 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo instruction_list = body.instruction_id_list kwargs_list = body.kwargs is_following_list = [] + is_following_list_loose = [] for instruction_id, kwargs in zip(instruction_list, kwargs_list): try: @@ -119,16 +156,15 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo # Build the instruction description with the provided kwargs instruction.build_description(**filtered_kwargs) - # Check if the response follows the instruction - if instruction.check_following(final_response_text): - is_following_list.append(True) - else: - is_following_list.append(False) + # Check strict and loose from the same instruction instance + is_following_list.append(instruction.check_following(final_response_text)) + is_following_list_loose.append(_check_following_loose(instruction, final_response_text)) except Exception as e: # If there's an error processing the instruction, mark as failed print(f"Error processing instruction {instruction_id}: {e}") is_following_list.append(False) + is_following_list_loose.append(False) # Calculate overall success reward_mode = getattr(body, "grading_mode", "binary") @@ -144,8 +180,41 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo reward=float(reward), follow_all_instructions=all(is_following_list), follow_instruction_list=is_following_list, + follow_all_instructions_loose=all(is_following_list_loose), + follow_instruction_list_loose=is_following_list_loose, ) + def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]: + """Compute the four IFEval accuracy metrics over all verify responses. + + tasks[i] is the list of rollout dicts for task i. Each dict contains + follow_instruction_list and follow_instruction_list_loose. + """ + prompt_strict: list = [] + instruction_strict: list = [] + prompt_loose: list = [] + instruction_loose: list = [] + + for task_rollouts in tasks: + for rd in task_rollouts: + strict_list = rd.get("follow_instruction_list", []) + loose_list = rd.get("follow_instruction_list_loose", []) + prompt_strict.append(float(all(strict_list)) if strict_list else 0.0) + prompt_loose.append(float(all(loose_list)) if loose_list else 0.0) + instruction_strict.extend(float(v) for v in strict_list) + instruction_loose.extend(float(v) for v in loose_list) + + def _mean(lst: list) -> float: + return sum(lst) / len(lst) if lst else 0.0 + + return { + "prompt_strict_accuracy": _mean(prompt_strict), + "instruction_strict_accuracy": _mean(instruction_strict), + "prompt_loose_accuracy": _mean(prompt_loose), + "instruction_loose_accuracy": _mean(instruction_loose), + } + + if __name__ == "__main__": InstructionFollowingResourcesServer.run_webserver() diff --git a/resources_servers/instruction_following/tests/test_app.py b/resources_servers/instruction_following/tests/test_app.py index 7e4f4590e5..e178161f72 100644 --- a/resources_servers/instruction_following/tests/test_app.py +++ b/resources_servers/instruction_following/tests/test_app.py @@ -202,3 +202,46 @@ def test_fractional_reward_half(self): grading_mode="fraction", ) self._run_verify_test(real_request, False, 0.5, [True, False]) + + def test_loose_fields_present(self): + real_request = self._create_real_request( + instruction_ids=["punctuation:no_comma"], + prompt="The output should not contain any commas.", + kwargs=[{}], + response_content="Hello world without commas", + ) + server = self._create_server() + result = asyncio.run(server.verify(real_request)) + assert isinstance(result.follow_all_instructions_loose, bool) + assert isinstance(result.follow_instruction_list_loose, list) + assert len(result.follow_instruction_list_loose) == 1 + + def test_loose_geq_strict(self): + real_request = self._create_real_request( + instruction_ids=["punctuation:no_comma"], + prompt="No commas please.", + kwargs=[{}], + response_content="Hello, world", + ) + server = self._create_server() + result = asyncio.run(server.verify(real_request)) + for strict, loose in zip(result.follow_instruction_list, result.follow_instruction_list_loose): + assert loose >= strict + + def test_compute_metrics_four_keys(self): + server = self._create_server() + tasks = [ + [{"follow_instruction_list": [True, False], "follow_instruction_list_loose": [True, True]}], + [{"follow_instruction_list": [True], "follow_instruction_list_loose": [True]}], + ] + metrics = server.compute_metrics(tasks) + assert set(metrics.keys()) == { + "prompt_strict_accuracy", + "instruction_strict_accuracy", + "prompt_loose_accuracy", + "instruction_loose_accuracy", + } + assert metrics["prompt_strict_accuracy"] == 0.5 + assert abs(metrics["instruction_strict_accuracy"] - 2 / 3) < 1e-9 + assert metrics["prompt_loose_accuracy"] == 1.0 + assert metrics["instruction_loose_accuracy"] == 1.0 From 194c03144e88a5594ae8befc1b5bb5e7773bfcbd Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 20:50:57 -0400 Subject: [PATCH 2/4] fix(instruction_following): align loose perturbation set with IFEval/Skills spec Signed-off-by: Rod Boev --- .../instruction_following/app.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py index 02128e8c70..84f5a4763a 100644 --- a/resources_servers/instruction_following/app.py +++ b/resources_servers/instruction_following/app.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re -import string from typing import Any, Dict, List, Literal from fastapi import FastAPI @@ -48,25 +46,30 @@ class InstructionFollowingVerifyRequest(InstructionFollowingRunRequest, BaseVeri def _get_loose_perturbations(text: str) -> list: - """Return the 8 IFEval loose-mode perturbations of a response string.""" - stripped = text.strip() - sentences = re.split(r'(?<=[.?!])\s+', stripped) - first_sentence = sentences[0] if sentences else stripped - paragraphs = re.split(r'\n\s*\n', stripped) - first_paragraph = paragraphs[0] if paragraphs else stripped - no_punct = stripped.translate(str.maketrans('', '', string.punctuation)).strip() - star_match = re.search(r'\*(.*?)\*', text, re.DOTALL) - between_stars = star_match.group(1) if star_match else text - return [ + """Return IFEval loose-mode perturbations following the NeMo Skills convention. + + Produces 4 line-removal variants of the text (original, without first line, + without last line, without first and last line), each duplicated with asterisks + removed. Empty variants are excluded. + """ + def remove_stars(s: str) -> str: + return s.replace("*", "") + + def without_first_line(s: str) -> str: + idx = s.find("\n") + return s[idx + 1 :] if idx >= 0 else "" + + def without_last_line(s: str) -> str: + idx = s.rfind("\n") + return s[:idx] if idx >= 0 else "" + + base = [ text, - stripped, - text.lower(), - first_sentence, - first_paragraph, - stripped.lower(), - no_punct, - between_stars, + without_first_line(text), + without_last_line(text), + without_last_line(without_first_line(text)), ] + return [v for s in base for v in (s, remove_stars(s)) if v.strip()] def _check_following_loose(instruction, text: str) -> bool: From 4105620cab8105164ad0f3c7257ac23efc5ec44e Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 24 Jun 2026 21:00:05 -0400 Subject: [PATCH 3/4] fix(instruction_following): scale compute_metrics to 0-100 to match Gym convention Signed-off-by: Rod Boev --- resources_servers/instruction_following/app.py | 8 ++++---- resources_servers/instruction_following/tests/test_app.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py index 84f5a4763a..ae0f6f6142 100644 --- a/resources_servers/instruction_following/app.py +++ b/resources_servers/instruction_following/app.py @@ -212,10 +212,10 @@ def _mean(lst: list) -> float: return sum(lst) / len(lst) if lst else 0.0 return { - "prompt_strict_accuracy": _mean(prompt_strict), - "instruction_strict_accuracy": _mean(instruction_strict), - "prompt_loose_accuracy": _mean(prompt_loose), - "instruction_loose_accuracy": _mean(instruction_loose), + "prompt_strict_accuracy": _mean(prompt_strict) * 100.0, + "instruction_strict_accuracy": _mean(instruction_strict) * 100.0, + "prompt_loose_accuracy": _mean(prompt_loose) * 100.0, + "instruction_loose_accuracy": _mean(instruction_loose) * 100.0, } diff --git a/resources_servers/instruction_following/tests/test_app.py b/resources_servers/instruction_following/tests/test_app.py index e178161f72..d9014a5ae6 100644 --- a/resources_servers/instruction_following/tests/test_app.py +++ b/resources_servers/instruction_following/tests/test_app.py @@ -241,7 +241,7 @@ def test_compute_metrics_four_keys(self): "prompt_loose_accuracy", "instruction_loose_accuracy", } - assert metrics["prompt_strict_accuracy"] == 0.5 - assert abs(metrics["instruction_strict_accuracy"] - 2 / 3) < 1e-9 - assert metrics["prompt_loose_accuracy"] == 1.0 - assert metrics["instruction_loose_accuracy"] == 1.0 + assert metrics["prompt_strict_accuracy"] == 50.0 + assert abs(metrics["instruction_strict_accuracy"] - 200 / 3) < 1e-9 + assert metrics["prompt_loose_accuracy"] == 100.0 + assert metrics["instruction_loose_accuracy"] == 100.0 From 26746b5e38d42c48a29785ff4939aa28ab5792e9 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Thu, 25 Jun 2026 08:43:03 -0400 Subject: [PATCH 4/4] Keep loose-mode evaluation branch consistent with repo formatting Signed-off-by: Rod Boev --- resources_servers/instruction_following/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py index ae0f6f6142..b0522a3d13 100644 --- a/resources_servers/instruction_following/app.py +++ b/resources_servers/instruction_following/app.py @@ -52,6 +52,7 @@ def _get_loose_perturbations(text: str) -> list: without last line, without first and last line), each duplicated with asterisks removed. Empty variants are excluded. """ + def remove_stars(s: str) -> str: return s.replace("*", "") @@ -74,10 +75,10 @@ def without_last_line(s: str) -> str: def _check_following_loose(instruction, text: str) -> bool: """Check instruction against native loose API or 8 perturbations.""" - if hasattr(instruction, 'check_following_loose'): + if hasattr(instruction, "check_following_loose"): return instruction.check_following_loose(text) try: - return instruction.check_following(text, mode='loose') + return instruction.check_following(text, mode="loose") except TypeError: return any(instruction.check_following(p) for p in _get_loose_perturbations(text)) @@ -187,7 +188,6 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo follow_instruction_list_loose=is_following_list_loose, ) - def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]: """Compute the four IFEval accuracy metrics over all verify responses.