From 84bbe6beeaa8654b6e4ec32379e15e004ef942f0 Mon Sep 17 00:00:00 2001 From: xjtupanda <617048176@qq.com> Date: Mon, 23 Jun 2025 16:22:46 +0800 Subject: [PATCH] eval: shuffle V* options and judge by answer_choice; fix hrbench path arg, answer_str match and missing imports --- eval/eval_hrbench.py | 2 ++ eval/eval_vstar.py | 9 ++++++--- eval/judge_result.py | 7 ++++--- eval/judge_result_hrbench.py | 7 +++++-- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/eval/eval_hrbench.py b/eval/eval_hrbench.py index 2ccccc1b9..0874e54be 100644 --- a/eval/eval_hrbench.py +++ b/eval/eval_hrbench.py @@ -13,6 +13,8 @@ import io from openai import OpenAI import requests +import copy +import pandas as pd parser = argparse.ArgumentParser() diff --git a/eval/eval_vstar.py b/eval/eval_vstar.py index 21634bb05..ed47e94f6 100644 --- a/eval/eval_vstar.py +++ b/eval/eval_vstar.py @@ -13,7 +13,7 @@ import io from openai import OpenAI import requests - +from random import shuffle parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='qwen', help='Model name for result save') @@ -127,7 +127,9 @@ def process(img_arg): anno = json.load(f) question = anno['question'] options = anno['options'] - + correct_answer = anno['options'][0] + shuffle(options) + option_str = "\n" for i in range(len(options)): option_str += abc_map[i + 1] + '. 
' + options[i] + '\n' @@ -270,7 +272,8 @@ def process(img_arg): save_info = {} save_info['image'] = img save_info['question'] = question - save_info['answer'] = anno['options'][0] + save_info['answer'] = correct_answer + save_info['answer_choice'] = chr(ord('A') + options.index(correct_answer)) save_info['pred_ans'] = output_text save_info['pred_output'] = print_messages save_info['status'] = status diff --git a/eval/judge_result.py b/eval/judge_result.py index a5c267e23..d7a63177b 100644 --- a/eval/judge_result.py +++ b/eval/judge_result.py @@ -134,10 +134,11 @@ def process(line): line = line.strip() data = json.loads(line) question = data['question'] + choice = data['answer_choice'] answer = data['answer'] pred_ans = data['pred_ans'] pred_output = data['pred_output'] - answer = 'A. ' + answer + answer = f"{choice}. {answer}" if '\\boxed' in pred_ans: pred_ans = pred_ans.split('\\boxed{')[1].split('}')[0] @@ -145,12 +146,12 @@ def process(line): # rule base check acc_reward = 0.0 if len(pred_ans)==1: - if pred_ans == 'A': + if pred_ans == choice: #'A': acc_reward = 1.0 else: acc_reward = 0.0 elif len(pred_ans) == 2 and '.' 
in pred_ans: - if 'A' in pred_ans: + if choice in pred_ans: #'A' in pred_ans: acc_reward = 1.0 else: acc_reward = 0.0 diff --git a/eval/judge_result_hrbench.py b/eval/judge_result_hrbench.py index c6ff1a39b..fc9af6e56 100644 --- a/eval/judge_result_hrbench.py +++ b/eval/judge_result_hrbench.py @@ -36,7 +36,7 @@ else: eval_model_name = args.eval_model_name -hrbench_path = args.vstar_bench_path +hrbench_path = args.hrbench_path result_root_path = args.save_path result_root_path = os.path.join(result_root_path, args.model_name) @@ -129,6 +129,7 @@ def process(line): data = json.loads(line) question = data['question'] answer = data['answer'] + answer_str = data['answer_str'] pred_ans = data['pred_ans'] pred_output = data['pred_output'] category = data['category'] @@ -148,7 +149,9 @@ def process(line): acc_reward = 1.0 else: acc_reward = 0.0 - elif answer in pred_ans: + # elif answer in pred_ans: + # acc_reward = 1.0 + elif answer_str in pred_ans: acc_reward = 1.0 else: full_prompt = get_prompt(pred_ans, answer, question)