From 257ae85db04b28b1fef5cd6db593d55743f7d2a5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 11 May 2026 11:04:28 +0200 Subject: [PATCH 1/2] fix(repl_env): handle nested parens in FINAL(...) extraction Switch the FINAL(...) regex from lazy '.*?' to greedy '.*' anchored to line start/end (re.MULTILINE | re.DOTALL), matching the official RLM fix in alex-zhang13/rlm#75. The lazy pattern stopped at the first ')' and would mis-extract answers containing nested parentheses, e.g. FINAL(2^(2^(2^(2))) = 65536) was truncated to '2^(2^(2^(2'. --- envs/repl_env/runner.py | 7 +++++-- envs/repl_env/server/repl_environment.py | 7 +++++-- tests/envs/test_repl_env.py | 24 ++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/envs/repl_env/runner.py b/envs/repl_env/runner.py index 226b26657..2f9811e0e 100644 --- a/envs/repl_env/runner.py +++ b/envs/repl_env/runner.py @@ -243,8 +243,11 @@ def _default_answer( ] try: response = self._chat(final_prompt, model) - # Try to extract FINAL(...) from the response - match = re.search(r"FINAL\((.*?)\)", response, re.DOTALL) + # Try to extract FINAL(...) from the response. Greedy + line-anchored + # so nested parentheses are captured fully (matches official RLM). + match = re.search( + r"^\s*FINAL\((.*)\)\s*$", response, re.MULTILINE | re.DOTALL + ) if match: return match.group(1).strip() # If no FINAL pattern, return the raw response as best-effort diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py index f2e6f5d98..d9eee6845 100644 --- a/envs/repl_env/server/repl_environment.py +++ b/envs/repl_env/server/repl_environment.py @@ -536,8 +536,11 @@ def _extract_final_answer(self, stdout: str) -> Optional[str]: Returns: Final answer string or None if not found """ - # Pattern 1: RLM-style FINAL(answer) - final_match = re.search(r"FINAL\((.*?)\)", stdout, re.DOTALL) + # Pattern 1: RLM-style FINAL(answer). Greedy + line-anchored so nested + # parentheses (e.g. FINAL(f(x)), FINAL((1, 2, 3))) are captured fully. + final_match = re.search( + r"^\s*FINAL\((.*)\)\s*$", stdout, re.MULTILINE | re.DOTALL + ) if final_match: return final_match.group(1).strip() diff --git a/tests/envs/test_repl_env.py b/tests/envs/test_repl_env.py index 4811119fb..14fea8426 100644 --- a/tests/envs/test_repl_env.py +++ b/tests/envs/test_repl_env.py @@ -229,6 +229,30 @@ def test_final_pattern_basic(self): assert obs.done assert obs.metadata["final_answer"] == "42" + @pytest.mark.parametrize( + "code, expected", + [ + # Nested function calls inside FINAL(...). + ("print('FINAL(f(x))')", "f(x)"), + # Tuple as the final answer. + ("print('FINAL((1, 2, 3))')", "(1, 2, 3)"), + # Math expression with multiple nested parens (e2b_repl_example). + ( + "print('FINAL(2^(2^(2^(2))) = 65536)')", + "2^(2^(2^(2))) = 65536", + ), + # Dict containing a tuple value. + ("print(\"FINAL({'a': (1, 2)})\")", "{'a': (1, 2)}"), + ], + ) + def test_final_pattern_nested_parentheses(self, code, expected): + """FINAL(...) extraction must handle nested parentheses (rlm #75).""" + env = REPLEnvironment() + env.reset() + obs = env.step(REPLAction(code=code)) + assert obs.done + assert obs.metadata["final_answer"] == expected + def test_final_var_pattern(self): """Test FINAL_VAR() pattern.""" env = REPLEnvironment() From 52fa299c3bc10c40f2e6d0d9daefb4078c6570f6 Mon Sep 17 00:00:00 2001 From: sergiopaniego Date: Mon, 18 May 2026 11:27:13 +0200 Subject: [PATCH 2/2] update based on reviews --- envs/repl_env/runner.py | 19 ++++++++++++------- envs/repl_env/server/repl_environment.py | 19 ++++++++++++------- tests/envs/test_repl_env.py | 2 ++ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/envs/repl_env/runner.py b/envs/repl_env/runner.py index 2f9811e0e..bf37d0e7b 100644 --- a/envs/repl_env/runner.py +++ b/envs/repl_env/runner.py @@ -243,13 +243,18 @@ def _default_answer( ] try: response = self._chat(final_prompt, model) - # Try to extract FINAL(...) from the response. Greedy + line-anchored - # so nested parentheses are captured fully (matches official RLM). - match = re.search( - r"^\s*FINAL\((.*)\)\s*$", response, re.MULTILINE | re.DOTALL - ) - if match: - return match.group(1).strip() + # Try to extract FINAL(...) from the response. Paren-counting handles + # nested parens and mid-sentence FINAL without regex flag trade-offs. + idx = response.find("FINAL(") + if idx != -1: + depth, start = 0, idx + len("FINAL") + for i, ch in enumerate(response[idx + len("FINAL"):], start=idx + len("FINAL")): + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0: + return response[start + 1 : i].strip() # If no FINAL pattern, return the raw response as best-effort return response.strip() if response.strip() else None except Exception: diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py index d9eee6845..8ba645a44 100644 --- a/envs/repl_env/server/repl_environment.py +++ b/envs/repl_env/server/repl_environment.py @@ -536,13 +536,18 @@ def _extract_final_answer(self, stdout: str) -> Optional[str]: Returns: Final answer string or None if not found """ - # Pattern 1: RLM-style FINAL(answer). Greedy + line-anchored so nested - # parentheses (e.g. FINAL(f(x)), FINAL((1, 2, 3))) are captured fully. - final_match = re.search( - r"^\s*FINAL\((.*)\)\s*$", stdout, re.MULTILINE | re.DOTALL - ) - if final_match: - return final_match.group(1).strip() + # Pattern 1: RLM-style FINAL(answer). Paren-counting handles nested + # parens (e.g. FINAL(f(x))) and multi-line values without regex flag trade-offs. + idx = stdout.find("FINAL(") + if idx != -1: + depth, start = 0, idx + len("FINAL") + for i, ch in enumerate(stdout[idx + len("FINAL"):], start=idx + len("FINAL")): + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0: + return stdout[start + 1 : i].strip() # Pattern 2: RLM-style FINAL_VAR(variable_name) final_var_match = re.search(r"FINAL_VAR\((\w+)\)", stdout) diff --git a/tests/envs/test_repl_env.py b/tests/envs/test_repl_env.py index 14fea8426..b8cb8311e 100644 --- a/tests/envs/test_repl_env.py +++ b/tests/envs/test_repl_env.py @@ -243,6 +243,8 @@ def test_final_pattern_basic(self): ), # Dict containing a tuple value. ("print(\"FINAL({'a': (1, 2)})\")", "{'a': (1, 2)}"), + # Output after FINAL must not bleed into the extracted answer. + ("print('FINAL(42)\\nresult: (ok)')", "42"), ], ) def test_final_pattern_nested_parentheses(self, code, expected):