From 180a4857abdeda921c4363850b4e4b7b6b80684a Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Wed, 24 Jun 2026 20:35:41 -0400
Subject: [PATCH 1/4] feat(instruction_following): add loose-mode evaluation
 and compute_metrics

Signed-off-by: Rod Boev <rod.boev@gmail.com>
---
 .../instruction_following/app.py              | 81 +++++++++++++++++--
 .../instruction_following/tests/test_app.py   | 43 ++++++++++
 2 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py
index feaae8f062..02128e8c70 100644
--- a/resources_servers/instruction_following/app.py
+++ b/resources_servers/instruction_following/app.py
@@ -12,7 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Literal
+import re
+import string
+from typing import Any, Dict, List, Literal
 
 from fastapi import FastAPI
 from verifiable_instructions import instructions_registry
@@ -45,9 +47,43 @@ class InstructionFollowingVerifyRequest(InstructionFollowingRunRequest, BaseVeri
     pass
 
 
+def _get_loose_perturbations(text: str) -> list:
+    """Return the 8 IFEval loose-mode perturbations of a response string."""
+    stripped = text.strip()
+    sentences = re.split(r'(?<=[.?!])\s+', stripped)
+    first_sentence = sentences[0] if sentences else stripped
+    paragraphs = re.split(r'\n\s*\n', stripped)
+    first_paragraph = paragraphs[0] if paragraphs else stripped
+    no_punct = stripped.translate(str.maketrans('', '', string.punctuation)).strip()
+    star_match = re.search(r'\*(.*?)\*', text, re.DOTALL)
+    between_stars = star_match.group(1) if star_match else text
+    return [
+        text,
+        stripped,
+        text.lower(),
+        first_sentence,
+        first_paragraph,
+        stripped.lower(),
+        no_punct,
+        between_stars,
+    ]
+
+
+def _check_following_loose(instruction, text: str) -> bool:
+    """Check instruction against native loose API or 8 perturbations."""
+    if hasattr(instruction, 'check_following_loose'):
+        return instruction.check_following_loose(text)
+    try:
+        return instruction.check_following(text, mode='loose')
+    except TypeError:
+        return any(instruction.check_following(p) for p in _get_loose_perturbations(text))
+
+
 class InstructionFollowingVerifyResponse(BaseVerifyResponse):
     follow_all_instructions: bool
     follow_instruction_list: List[bool]
+    follow_all_instructions_loose: bool
+    follow_instruction_list_loose: List[bool]
     kwargs: List
     instruction_id_list: List
     prompt: str
@@ -102,6 +138,7 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo
         instruction_list = body.instruction_id_list
         kwargs_list = body.kwargs
         is_following_list = []
+        is_following_list_loose = []
 
         for instruction_id, kwargs in zip(instruction_list, kwargs_list):
             try:
@@ -119,16 +156,15 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo
                 # Build the instruction description with the provided kwargs
                 instruction.build_description(**filtered_kwargs)
 
-                # Check if the response follows the instruction
-                if instruction.check_following(final_response_text):
-                    is_following_list.append(True)
-                else:
-                    is_following_list.append(False)
+                loose_ok = _check_following_loose(instruction, final_response_text)  # may raise: run before appending
+                is_following_list.append(instruction.check_following(final_response_text))
+                is_following_list_loose.append(loose_ok)
 
             except Exception as e:
                 # If there's an error processing the instruction, mark as failed
                 print(f"Error processing instruction {instruction_id}: {e}")
                 is_following_list.append(False)
+                is_following_list_loose.append(False)
 
         # Calculate overall success
         reward_mode = getattr(body, "grading_mode", "binary")
@@ -144,8 +180,41 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo
             reward=float(reward),
             follow_all_instructions=all(is_following_list),
             follow_instruction_list=is_following_list,
+            follow_all_instructions_loose=all(is_following_list_loose),
+            follow_instruction_list_loose=is_following_list_loose,
         )
 
 
+    def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]:
+        """Compute the four IFEval accuracy metrics over all verify responses.
+
+        tasks[i] is the list of rollout dicts for task i. Each dict contains
+        follow_instruction_list and follow_instruction_list_loose.
+        """
+        prompt_strict: list = []  # one 0.0/1.0 entry per rollout
+        instruction_strict: list = []  # one 0.0/1.0 entry per instruction result
+        prompt_loose: list = []
+        instruction_loose: list = []
+
+        for task_rollouts in tasks:
+            for rd in task_rollouts:
+                strict_list = rd.get("follow_instruction_list", [])
+                loose_list = rd.get("follow_instruction_list_loose", [])
+                prompt_strict.append(float(all(strict_list)) if strict_list else 0.0)  # empty/missing list -> 0.0
+                prompt_loose.append(float(all(loose_list)) if loose_list else 0.0)
+                instruction_strict.extend(float(v) for v in strict_list)
+                instruction_loose.extend(float(v) for v in loose_list)
+
+        def _mean(lst: list) -> float:
+            return sum(lst) / len(lst) if lst else 0.0
+
+        return {
+            "prompt_strict_accuracy": _mean(prompt_strict),
+            "instruction_strict_accuracy": _mean(instruction_strict),
+            "prompt_loose_accuracy": _mean(prompt_loose),
+            "instruction_loose_accuracy": _mean(instruction_loose),
+        }
+
+
 if __name__ == "__main__":
     InstructionFollowingResourcesServer.run_webserver()
diff --git a/resources_servers/instruction_following/tests/test_app.py b/resources_servers/instruction_following/tests/test_app.py
index 7e4f4590e5..e178161f72 100644
--- a/resources_servers/instruction_following/tests/test_app.py
+++ b/resources_servers/instruction_following/tests/test_app.py
@@ -202,3 +202,46 @@ def test_fractional_reward_half(self):
             grading_mode="fraction",
         )
         self._run_verify_test(real_request, False, 0.5, [True, False])
+
+    def test_loose_fields_present(self):
+        """The verify response carries both loose-mode fields, with one list entry for the one instruction."""
+        request = self._create_real_request(
+            instruction_ids=["punctuation:no_comma"],
+            prompt="The output should not contain any commas.",
+            kwargs=[{}],
+            response_content="Hello world without commas",
+        )
+        response = asyncio.run(self._create_server().verify(request))
+        assert isinstance(response.follow_all_instructions_loose, bool)
+        assert isinstance(response.follow_instruction_list_loose, list)
+        assert len(response.follow_instruction_list_loose) == 1
+
+    def test_loose_geq_strict(self):
+        real_request = self._create_real_request(
+            instruction_ids=["punctuation:no_comma"],
+            prompt="No commas please.",
+            kwargs=[{}],
+            response_content="Hello, world",
+        )
+        result = asyncio.run(self._create_server().verify(real_request))
+        strict, loose = result.follow_instruction_list, result.follow_instruction_list_loose
+        assert len(strict) == len(loose) == 1  # zip() truncates, so it alone would hide a length mismatch
+        assert all(lo >= st for st, lo in zip(strict, loose))  # loose may only match or exceed strict
+
+    def test_compute_metrics_four_keys(self):
+        server = self._create_server()
+        tasks = [  # two tasks, one rollout each
+            [{"follow_instruction_list": [True, False], "follow_instruction_list_loose": [True, True]}],
+            [{"follow_instruction_list": [True], "follow_instruction_list_loose": [True]}],
+        ]
+        metrics = server.compute_metrics(tasks)
+        assert set(metrics.keys()) == {
+            "prompt_strict_accuracy",
+            "instruction_strict_accuracy",
+            "prompt_loose_accuracy",
+            "instruction_loose_accuracy",
+        }
+        assert metrics["prompt_strict_accuracy"] == 0.5
+        assert abs(metrics["instruction_strict_accuracy"] - 2 / 3) < 1e-9
+        assert metrics["prompt_loose_accuracy"] == 1.0
+        assert metrics["instruction_loose_accuracy"] == 1.0

From 194c03144e88a5594ae8befc1b5bb5e7773bfcbd Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Wed, 24 Jun 2026 20:50:57 -0400
Subject: [PATCH 2/4] fix(instruction_following): align loose perturbation set
 with IFEval/Skills spec

Signed-off-by: Rod Boev <rod.boev@gmail.com>
---
 .../instruction_following/app.py              | 41 ++++++++++---------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py
index 02128e8c70..84f5a4763a 100644
--- a/resources_servers/instruction_following/app.py
+++ b/resources_servers/instruction_following/app.py
@@ -12,8 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import re
-import string
 from typing import Any, Dict, List, Literal
 
 from fastapi import FastAPI
@@ -48,25 +46,30 @@ class InstructionFollowingVerifyRequest(InstructionFollowingRunRequest, BaseVeri
 
 
 def _get_loose_perturbations(text: str) -> list:
-    """Return the 8 IFEval loose-mode perturbations of a response string."""
-    stripped = text.strip()
-    sentences = re.split(r'(?<=[.?!])\s+', stripped)
-    first_sentence = sentences[0] if sentences else stripped
-    paragraphs = re.split(r'\n\s*\n', stripped)
-    first_paragraph = paragraphs[0] if paragraphs else stripped
-    no_punct = stripped.translate(str.maketrans('', '', string.punctuation)).strip()
-    star_match = re.search(r'\*(.*?)\*', text, re.DOTALL)
-    between_stars = star_match.group(1) if star_match else text
-    return [
+    """Return IFEval loose-mode perturbations following the NeMo Skills convention.
+
+    Produces 4 line-removal variants of the text (original, without first line,
+    without last line, without first and last line), each duplicated with asterisks
+    removed. Empty variants are excluded.
+    """
+    def remove_stars(s: str) -> str:
+        return s.replace("*", "")
+
+    def without_first_line(s: str) -> str:
+        idx = s.find("\n")
+        return s[idx + 1 :] if idx >= 0 else ""  # no newline: nothing remains
+
+    def without_last_line(s: str) -> str:
+        idx = s.rfind("\n")
+        return s[:idx] if idx >= 0 else ""  # no newline: nothing remains
+
+    base = [
         text,
-        stripped,
-        text.lower(),
-        first_sentence,
-        first_paragraph,
-        stripped.lower(),
-        no_punct,
-        between_stars,
+        without_first_line(text),
+        without_last_line(text),
+        without_last_line(without_first_line(text)),
     ]
+    return [v for s in base for v in (s, remove_stars(s)) if v.strip()]  # whitespace-only variants dropped
 
 
 def _check_following_loose(instruction, text: str) -> bool:

From 4105620cab8105164ad0f3c7257ac23efc5ec44e Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Wed, 24 Jun 2026 21:00:05 -0400
Subject: [PATCH 3/4] fix(instruction_following): scale compute_metrics to
 0-100 to match Gym convention

Signed-off-by: Rod Boev <rod.boev@gmail.com>
---
 resources_servers/instruction_following/app.py            | 8 ++++----
 resources_servers/instruction_following/tests/test_app.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py
index 84f5a4763a..ae0f6f6142 100644
--- a/resources_servers/instruction_following/app.py
+++ b/resources_servers/instruction_following/app.py
@@ -212,10 +212,10 @@ def _mean(lst: list) -> float:
             return sum(lst) / len(lst) if lst else 0.0
 
         return {
-            "prompt_strict_accuracy": _mean(prompt_strict),
-            "instruction_strict_accuracy": _mean(instruction_strict),
-            "prompt_loose_accuracy": _mean(prompt_loose),
-            "instruction_loose_accuracy": _mean(instruction_loose),
+            "prompt_strict_accuracy": _mean(prompt_strict) * 100.0,
+            "instruction_strict_accuracy": _mean(instruction_strict) * 100.0,
+            "prompt_loose_accuracy": _mean(prompt_loose) * 100.0,
+            "instruction_loose_accuracy": _mean(instruction_loose) * 100.0,
         }
 
 
diff --git a/resources_servers/instruction_following/tests/test_app.py b/resources_servers/instruction_following/tests/test_app.py
index e178161f72..d9014a5ae6 100644
--- a/resources_servers/instruction_following/tests/test_app.py
+++ b/resources_servers/instruction_following/tests/test_app.py
@@ -241,7 +241,7 @@ def test_compute_metrics_four_keys(self):
             "prompt_loose_accuracy",
             "instruction_loose_accuracy",
         }
-        assert metrics["prompt_strict_accuracy"] == 0.5
-        assert abs(metrics["instruction_strict_accuracy"] - 2 / 3) < 1e-9
-        assert metrics["prompt_loose_accuracy"] == 1.0
-        assert metrics["instruction_loose_accuracy"] == 1.0
+        assert metrics["prompt_strict_accuracy"] == 50.0
+        assert abs(metrics["instruction_strict_accuracy"] - 200 / 3) < 1e-9
+        assert metrics["prompt_loose_accuracy"] == 100.0
+        assert metrics["instruction_loose_accuracy"] == 100.0

From 26746b5e38d42c48a29785ff4939aa28ab5792e9 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Thu, 25 Jun 2026 08:43:03 -0400
Subject: [PATCH 4/4] Keep loose-mode evaluation branch consistent with repo
 formatting

Signed-off-by: Rod Boev <rod.boev@gmail.com>
---
 resources_servers/instruction_following/app.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/resources_servers/instruction_following/app.py b/resources_servers/instruction_following/app.py
index ae0f6f6142..b0522a3d13 100644
--- a/resources_servers/instruction_following/app.py
+++ b/resources_servers/instruction_following/app.py
@@ -52,6 +52,7 @@ def _get_loose_perturbations(text: str) -> list:
     without last line, without first and last line), each duplicated with asterisks
     removed. Empty variants are excluded.
     """
+
     def remove_stars(s: str) -> str:
         return s.replace("*", "")
 
@@ -74,10 +75,10 @@ def without_last_line(s: str) -> str:
 
 def _check_following_loose(instruction, text: str) -> bool:
     """Check instruction against native loose API or 8 perturbations."""
-    if hasattr(instruction, 'check_following_loose'):
+    if hasattr(instruction, "check_following_loose"):
         return instruction.check_following_loose(text)
     try:
-        return instruction.check_following(text, mode='loose')
+        return instruction.check_following(text, mode="loose")
     except TypeError:
         return any(instruction.check_following(p) for p in _get_loose_perturbations(text))
 
@@ -187,7 +188,6 @@ async def verify(self, body: InstructionFollowingVerifyRequest) -> InstructionFo
             follow_instruction_list_loose=is_following_list_loose,
         )
 
-
     def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]:
         """Compute the four IFEval accuracy metrics over all verify responses.