From 84bbe6beeaa8654b6e4ec32379e15e004ef942f0 Mon Sep 17 00:00:00 2001
From: xjtupanda <617048176@qq.com>
Date: Mon, 23 Jun 2025 16:22:46 +0800
Subject: [PATCH] eval: shuffle vstar options, judge by answer_choice, fix hrbench_path arg

---
 eval/eval_hrbench.py         | 2 ++
 eval/eval_vstar.py           | 9 ++++++---
 eval/judge_result.py         | 7 ++++---
 eval/judge_result_hrbench.py | 7 +++++--
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/eval/eval_hrbench.py b/eval/eval_hrbench.py
index 2ccccc1b9..0874e54be 100644
--- a/eval/eval_hrbench.py
+++ b/eval/eval_hrbench.py
@@ -13,6 +13,8 @@
 import io
 from openai import OpenAI
 import requests
+import copy
+import pandas as pd
 
 
 parser = argparse.ArgumentParser()
diff --git a/eval/eval_vstar.py b/eval/eval_vstar.py
index 21634bb05..ed47e94f6 100644
--- a/eval/eval_vstar.py
+++ b/eval/eval_vstar.py
@@ -13,7 +13,7 @@
 import io
 from openai import OpenAI
 import requests
-
+from random import shuffle
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--model_name', type=str, default='qwen', help='Model name for result save')
@@ -127,7 +127,9 @@ def process(img_arg):
         anno = json.load(f)
     question = anno['question']
     options = anno['options']
-
+    correct_answer = anno['options'][0]
+    shuffle(options)
+    
     option_str = "\n"
     for i in range(len(options)):
         option_str += abc_map[i + 1] + '. ' + options[i] + '\n'
@@ -270,7 +272,8 @@ def process(img_arg):
     save_info = {}
     save_info['image'] = img
     save_info['question'] = question
-    save_info['answer'] = anno['options'][0]
+    save_info['answer'] = correct_answer
+    save_info['answer_choice'] = chr(ord('A') + options.index(correct_answer))
     save_info['pred_ans'] = output_text
     save_info['pred_output'] = print_messages
     save_info['status'] = status
diff --git a/eval/judge_result.py b/eval/judge_result.py
index a5c267e23..d7a63177b 100644
--- a/eval/judge_result.py
+++ b/eval/judge_result.py
@@ -134,10 +134,11 @@ def process(line):
     line = line.strip()
     data = json.loads(line)
     question = data['question']
+    choice = data['answer_choice']
     answer = data['answer']
     pred_ans = data['pred_ans']
     pred_output = data['pred_output']
-    answer = 'A. ' + answer
+    answer = f"{choice}. {answer}"
 
     if '\\boxed' in pred_ans:
         pred_ans = pred_ans.split('\\boxed{')[1].split('}')[0]
@@ -145,12 +146,12 @@ def process(line):
     # rule base check
     acc_reward = 0.0
     if len(pred_ans)==1:
-        if pred_ans == 'A':
+        if pred_ans == choice: #'A':
             acc_reward = 1.0
         else:
             acc_reward = 0.0
     elif len(pred_ans) == 2 and '.' in pred_ans:
-        if 'A' in pred_ans:
+        if choice in pred_ans: #'A' in pred_ans:
             acc_reward = 1.0
         else:
             acc_reward = 0.0
diff --git a/eval/judge_result_hrbench.py b/eval/judge_result_hrbench.py
index c6ff1a39b..fc9af6e56 100644
--- a/eval/judge_result_hrbench.py
+++ b/eval/judge_result_hrbench.py
@@ -36,7 +36,7 @@
 else:
     eval_model_name = args.eval_model_name
 
-hrbench_path = args.vstar_bench_path
+hrbench_path = args.hrbench_path
 result_root_path = args.save_path
 result_root_path = os.path.join(result_root_path, args.model_name)
 
@@ -129,6 +129,7 @@ def process(line):
     data = json.loads(line)
     question = data['question']
     answer = data['answer']
+    answer_str = data['answer_str']
     pred_ans = data['pred_ans']
     pred_output = data['pred_output']
     category = data['category']
@@ -148,7 +149,9 @@ def process(line):
             acc_reward = 1.0
         else:
             acc_reward = 0.0
-    elif answer in pred_ans:
+    # elif answer in pred_ans:
+    #     acc_reward = 1.0
+    elif answer_str in pred_ans:
         acc_reward = 1.0
     else:
         full_prompt = get_prompt(pred_ans, answer, question)