mlfoundations · neginraoof · Jun 5, 2025 · Jun 3, 2025 · Jun 3, 2025 · Jun 3, 2025
diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -44,7 +44,9 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768    # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 10
 

diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -43,7 +43,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_repeat = 10
 

diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -41,7 +41,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_trials = n_trials
 

diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.n_repeat = 10
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:

diff --git a/eval/chat_benchmarks/BigCodeBench/execution.py b/eval/chat_benchmarks/BigCodeBench/execution.py
@@ -60,9 +60,7 @@ def check_correctness(
     def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
-
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil

diff --git a/eval/chat_benchmarks/CodeElo/codeelo_utils.py b/eval/chat_benchmarks/CodeElo/codeelo_utils.py
@@ -18,15 +18,17 @@
 
 import scipy.stats as stats
 
+
 def rating_to_difficulty(rating):
     if rating < 1000:
-        return 'Easy'
+        return "Easy"
     if rating < 1300:
-        return 'Medium'
+        return "Medium"
     if rating <= 3500:
-        return 'Hard'
+        return "Hard"
+
+    return "Easy"
 
-    return 'Easy'
 
 def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     """
@@ -250,11 +252,13 @@ def codeelo_run(problem, completion, timeout, is_extracted):
         outs = tc[1]
         testtype = "stdin"
 
-        test_cases.append({
-            "input": ins,
-            "output": outs,
-            "testtype": testtype,
-        })
+        test_cases.append(
+            {
+                "input": ins,
+                "output": outs,
+                "testtype": testtype,
+            }
+        )
 
     manager = multiprocessing.Manager()
     result = manager.list()

diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py b/eval/chat_benchmarks/CodeElo/eval_instruct.py
@@ -48,7 +48,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int]  = None,
+        max_tokens: Optional[int] = None,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -63,7 +63,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True

diff --git a/eval/chat_benchmarks/CodeForces/codeforces_utils.py b/eval/chat_benchmarks/CodeForces/codeforces_utils.py
@@ -18,17 +18,19 @@
 
 import scipy.stats as stats
 
+
 def rating_to_difficulty(rating):
     if not rating:
-        return 'Easy'
+        return "Easy"
     if rating < 1000:
-        return 'Easy'
+        return "Easy"
     if rating < 1300:
-        return 'Medium'
+        return "Medium"
     if rating <= 3500:
-        return 'Hard'
+        return "Hard"
+
+    return "Easy"
 
-    return 'Easy'
 
 def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     """
@@ -155,8 +157,8 @@ def run_test_std(completion, test_input, test_output):
         sys.stdin = io.StringIO(test_input)
         try:
             exec(f'__name__ = "__main__"\n{completion}' if '__name__ == "__main__"' in completion else completion, {})
-            out = output.getvalue().strip().replace('\n',' ').replace('\r', '')
-            expected = test_output.strip().replace('\n', ' ').replace('\r', '')
+            out = output.getvalue().strip().replace("\n", " ").replace("\r", "")
+            expected = test_output.strip().replace("\n", " ").replace("\r", "")
 
             return out == expected, output.getvalue().strip()
         finally:
@@ -247,7 +249,6 @@ def run_tests_for_one_example(test_cases, completion, result_list, is_extracted)
             return
 
 
-
 def codeforces_run(problem, completion, timeout, is_extracted):
     test_cases = problem["official_tests"]
     test_cases = [{**x, "testtype": "stdin"} for x in test_cases]

diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -62,9 +62,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True

diff --git a/eval/chat_benchmarks/CruxEval/evaluation.py b/eval/chat_benchmarks/CruxEval/evaluation.py
@@ -72,7 +72,6 @@ def evaluate_generations(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0

diff --git a/eval/chat_benchmarks/CruxEval/execution.py b/eval/chat_benchmarks/CruxEval/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil

diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py
@@ -77,7 +77,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
 

diff --git a/eval/chat_benchmarks/HLE/testing_utils.py b/eval/chat_benchmarks/HLE/testing_utils.py
@@ -7,11 +7,7 @@
 
 def get_multiple_choice_answer(pred: str):
     # Try to pull out “Answer: X”, “Answer: {X}” or “Answer: \boxed{X}”
-    m = re.search(
-        r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?",
-        pred, 
-        re.IGNORECASE
-    )
+    m = re.search(r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?", pred, re.IGNORECASE)
     if m:
         return m.group(1).upper()
 

diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -47,7 +47,9 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.dataset_name = dataset_name
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 10
 

diff --git a/eval/chat_benchmarks/HMMT/matharena/api.py b/eval/chat_benchmarks/HMMT/matharena/api.py
@@ -48,7 +48,6 @@ def __init__(
         openai_responses=False,
         **kwargs,
     ):
-
         # if "think" in model and api == "google":
         #     logger.info("Google Think model does not allow chat.")
         #     is_chat = False # think model cannot handle chat