Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion eval/chat_benchmarks/AIME24/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.seed = seed
self.n_repeat = 10

Expand Down
2 changes: 1 addition & 1 deletion eval/chat_benchmarks/AIME25/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.seed = seed
self.n_repeat = 10

Expand Down
2 changes: 1 addition & 1 deletion eval/chat_benchmarks/AIW/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.data_file = data_file
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.seed = seed
self.n_trials = n_trials

Expand Down
2 changes: 1 addition & 1 deletion eval/chat_benchmarks/AMC23/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
self.data_file = data_file
self.debug = debug
self.seed = seed
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.n_repeat = 10

def generate_responses(self, model: LM) -> Dict[str, Any]:
Expand Down
2 changes: 0 additions & 2 deletions eval/chat_benchmarks/BigCodeBench/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ def check_correctness(
def unsafe_execute(tmp_dir):
random_id = random.randint(1, 100000)
if "python" in language_type.lower():

with create_tempdir():

# These system calls are needed when cleaning up tempdir.
import os
import shutil
Expand Down
22 changes: 13 additions & 9 deletions eval/chat_benchmarks/CodeElo/codeelo_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@

import scipy.stats as stats


def rating_to_difficulty(rating):
    """Map a numeric problem rating to a coarse difficulty label.

    Args:
        rating: Numeric problem rating. A missing/falsy rating (``None`` or
            ``0``) is treated as "Easy" instead of raising ``TypeError`` on
            the comparisons below.

    Returns:
        ``"Easy"`` for ratings below 1000, ``"Medium"`` for ratings below
        1300, ``"Hard"`` for ratings up to and including 3500, and
        ``"Easy"`` for anything above 3500.
    """
    # Guard against unrated problems: comparing None with an int raises in Python 3.
    if not rating:
        return "Easy"
    if rating < 1000:
        return "Easy"
    if rating < 1300:
        return "Medium"
    if rating <= 3500:
        return "Hard"

    # Ratings above 3500 fall through to the default label.
    # NOTE(review): defaulting out-of-range ratings to "Easy" looks surprising — confirm it is intended.
    return "Easy"

def reliability_guard(maximum_memory_bytes: Optional[int] = None):
"""
Expand Down Expand Up @@ -250,11 +252,13 @@ def codeelo_run(problem, completion, timeout, is_extracted):
outs = tc[1]
testtype = "stdin"

test_cases.append({
"input": ins,
"output": outs,
"testtype": testtype,
})
test_cases.append(
{
"input": ins,
"output": outs,
"testtype": testtype,
}
)

manager = multiprocessing.Manager()
result = manager.list()
Expand Down
6 changes: 4 additions & 2 deletions eval/chat_benchmarks/CodeElo/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
self,
debug: bool = False,
seed: List[int] = [0, 1234, 1234, 1234],
max_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
logger: Optional[logging.Logger] = None,
system_instruction: Optional[str] = None,
):
Expand All @@ -63,7 +63,9 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.seed = seed
self.n_repeat = 3
self.filter_interaction_questions = True
Expand Down
17 changes: 9 additions & 8 deletions eval/chat_benchmarks/CodeForces/codeforces_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,19 @@

import scipy.stats as stats


def rating_to_difficulty(rating):
    """Map a numeric problem rating to a coarse difficulty label.

    Args:
        rating: Numeric problem rating, or a falsy value (``None``, ``0``)
            when the problem has no rating.

    Returns:
        ``"Easy"`` for a missing rating or a rating below 1000, ``"Medium"``
        for ratings below 1300, ``"Hard"`` for ratings up to and including
        3500, and ``"Easy"`` for anything above 3500.
    """
    # Unrated problems are bucketed as "Easy"; this also avoids comparing None with an int.
    if not rating:
        return "Easy"
    if rating < 1000:
        return "Easy"
    if rating < 1300:
        return "Medium"
    if rating <= 3500:
        return "Hard"

    # Ratings above 3500 fall through to the default label.
    # NOTE(review): defaulting out-of-range ratings to "Easy" looks surprising — confirm it is intended.
    return "Easy"

def reliability_guard(maximum_memory_bytes: Optional[int] = None):
"""
Expand Down Expand Up @@ -155,8 +157,8 @@ def run_test_std(completion, test_input, test_output):
sys.stdin = io.StringIO(test_input)
try:
exec(f'__name__ = "__main__"\n{completion}' if '__name__ == "__main__"' in completion else completion, {})
out = output.getvalue().strip().replace('\n',' ').replace('\r', '')
expected = test_output.strip().replace('\n', ' ').replace('\r', '')
out = output.getvalue().strip().replace("\n", " ").replace("\r", "")
expected = test_output.strip().replace("\n", " ").replace("\r", "")

return out == expected, output.getvalue().strip()
finally:
Expand Down Expand Up @@ -247,7 +249,6 @@ def run_tests_for_one_example(test_cases, completion, result_list, is_extracted)
return



def codeforces_run(problem, completion, timeout, is_extracted):
test_cases = problem["official_tests"]
test_cases = [{**x, "testtype": "stdin"} for x in test_cases]
Expand Down
4 changes: 1 addition & 3 deletions eval/chat_benchmarks/CodeForces/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,7 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
)
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
self.seed = seed
self.n_repeat = 3
self.filter_interaction_questions = True
Expand Down
1 change: 0 additions & 1 deletion eval/chat_benchmarks/CruxEval/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def evaluate_generations(
sample_jsonl = stream_jsonl_all(input_file)

with ThreadPoolExecutor(max_workers=n_workers) as executor:

futures = []
completion_id = Counter()
n_samples = 0
Expand Down
1 change: 0 additions & 1 deletion eval/chat_benchmarks/CruxEval/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
random_id = random.randint(1, 100000)
if "python" in language_type.lower():
with create_tempdir():

# These system calls are needed when cleaning up tempdir.
import os
import shutil
Expand Down
4 changes: 3 additions & 1 deletion eval/chat_benchmarks/HLE/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ def __init__(
"""
super().__init__(logger=logger, system_instruction=system_instruction)
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.seed = seed
self.n_repeat = 3

Expand Down
6 changes: 1 addition & 5 deletions eval/chat_benchmarks/HLE/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@

def get_multiple_choice_answer(pred: str):
# Try to pull out “Answer: X”, “Answer: {X}” or “Answer: \boxed{X}”
m = re.search(
r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?",
pred,
re.IGNORECASE
)
m = re.search(r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?", pred, re.IGNORECASE)
if m:
return m.group(1).upper()

Expand Down
4 changes: 3 additions & 1 deletion eval/chat_benchmarks/HMMT/eval_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def __init__(
super().__init__(logger=logger, system_instruction=system_instruction)
self.dataset_name = dataset_name
self.debug = debug
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
self.max_new_tokens = (
max_tokens if max_tokens is not None else 32768
) # set higher to avoid truncation for reasoning models
self.seed = seed
self.n_repeat = 10

Expand Down
1 change: 0 additions & 1 deletion eval/chat_benchmarks/HMMT/matharena/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def __init__(
openai_responses=False,
**kwargs,
):

# if "think" in model and api == "google":
# logger.info("Google Think model does not allow chat.")
# is_chat = False # think model cannot handle chat
Expand Down
Loading