Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion methods/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .mapcoder import MapCoder_HumanEval, MapCoder_MBPP
from .self_consistency import SelfConsistency
from .mav import MAV_GPQA, MAV_HumanEval, MAV_Main, MAV_MATH, MAV_MMLU
from .aflow import AFlow_MATH

method2class = {
"vanilla": MAS,
Expand All @@ -38,7 +39,8 @@
"mav_humaneval": MAV_HumanEval,
"mav_main": MAV_Main,
"mav_math": MAV_MATH,
"mav_mmlu": MAV_MMLU
"mav_mmlu": MAV_MMLU,
"aflow_math": AFlow_MATH
}

def get_method_class(method_name, dataset_name=None):
Expand Down
1 change: 1 addition & 0 deletions methods/aflow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .aflow_math import AFlow_MATH
708 changes: 708 additions & 0 deletions methods/aflow/aflow_math.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions methods/aflow/configs/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample: 4
max_rounds: 20
validation_rounds: 3
earlystop: True
optimize_meta_model_name: "claude-3-5-sonnet-20241022"
optimize_execute_model_name: "gpt-4o-mini-2024-07-18"
242 changes: 242 additions & 0 deletions methods/aflow/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
import threading
import regex,re,json,inspect,time
from termcolor import colored
from typing import Any, List,Tuple,Callable,Dict,Optional
from math import isclose
from pathlib import Path
from pydantic_core import to_jsonable_python
from sympy import N, simplify
from sympy.parsing.latex import parse_latex
from sympy.parsing.sympy_parser import parse_expr
from .initial_workflows.math.template.sanitize import sanitize

def write_json_file(json_file: str, data: list, encoding: str = None, indent: int = 4):
    """Serialize *data* as JSON into *json_file*, creating parent folders first.

    Args:
        json_file: Destination path of the JSON document.
        data: Object handed to ``json.dump``.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.
        indent: Indentation width passed to ``json.dump``.
    """
    parent_dir = Path(json_file).parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True, exist_ok=True)

    dump_options = {
        "ensure_ascii": False,
        "indent": indent,
        # Values json cannot encode natively are routed through this hook.
        "default": to_jsonable_python,
    }
    with open(json_file, "w", encoding=encoding) as handle:
        json.dump(data, handle, **dump_options)

def extract_model_answer(text: str) -> str:
    r"""Extract the final answer from a solution text.

    The content of the last well-formed ``\boxed{...}`` is returned with
    surrounding whitespace stripped. Braces inside the box are matched by
    counting depth, so arbitrarily nested LaTeX such as
    ``\boxed{\frac{1}{\sqrt{2}}}`` is handled. When no balanced box exists,
    the last sentence of the text is returned instead.

    Args:
        text: Model output or reference solution.

    Returns:
        The extracted answer, or ``""`` when the text holds no sentence.
    """
    marker = "\\boxed{"
    boxed_contents = []
    search_from = 0
    text_length = len(text)

    while True:
        start = text.find(marker, search_from)
        if start == -1:
            break
        body_start = start + len(marker)

        # Walk forward until the brace opened by the marker is closed.
        depth = 1
        pos = body_start
        while pos < text_length and depth:
            char = text[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            pos += 1

        if depth == 0:
            boxed_contents.append(text[body_start:pos - 1])
            # Continue after the closing brace so matches never overlap.
            search_from = pos
        else:
            # Unbalanced box: skip this marker, later ones may still be valid.
            search_from = body_start

    if boxed_contents:
        return boxed_contents[-1].strip()

    # Fallback: split on sentence punctuation that does not follow a digit,
    # so a decimal point inside a number is not treated as a sentence end.
    sentence_end_pattern = r"(?<!\d)[.!?]\s+"
    sentences = [s.strip() for s in re.split(sentence_end_pattern, text) if s.strip()]
    return sentences[-1] if sentences else ""


async def evaluate_math(problem: dict, graph: Callable, log_path: str) -> Tuple[str, str, str, float]:
    """Run *graph* on one math problem and score its answer.

    Args:
        problem: Mapping with the keys ``"query"`` (the question) and
            ``"solution"`` (the reference solution text).
        graph: Awaitable callable invoked with the question text.
        log_path: Directory forwarded to ``log_mismatch`` for wrong answers.

    Returns:
        ``(input_text, output, expected_output, score)``. ``score`` is 1 when
        the extracted answers are judged equal by ``math_equal`` and 0
        otherwise. If anything raises, the exception text takes the place of
        the output and the score is 0.0.
    """
    input_text = problem["query"]
    expected_output = problem["solution"]

    try:
        output = await graph(input_text)
        expected_answer = extract_model_answer(expected_output)
        predicted_answer = extract_model_answer(output)

        uni_score = 1 if math_equal(predicted_answer, expected_answer) else 0

        if uni_score == 0:
            # Record the failing sample together with the extractor's source.
            log_mismatch(
                input_text,
                expected_output,
                output,
                predicted_answer,
                extract_answer_code=get_function_code(extract_model_answer),
                log_path=log_path,
            )
        return input_text, output, expected_output, uni_score

    except Exception as e:
        # Best-effort evaluation: a failing sample is reported and scored 0.
        print(colored(f"Maximum retries reached. Skipping this sample. Error: {e}","light_red"))
        return input_text, str(e), expected_output, 0.0

def math_equal(prediction: Any, reference: Any) -> bool:
    """Decide whether *prediction* and *reference* denote the same value.

    Three checks run in order:

    1. identical string representations;
    2. when both parse as numbers, closeness within an absolute tolerance
       of 1e-3 (this result is final, symbolic comparison is not tried);
    3. symbolic comparison through ``symbolic_equal``.

    Any error raised by a check counts as "not equal by that check".
    """
    if str(prediction) == str(reference):
        return True

    try:
        if is_digit(prediction) and is_digit(reference):
            return isclose(parse_digits(prediction), parse_digits(reference), abs_tol=1e-3)
    except Exception:
        # Numeric comparison is best effort; fall through to the symbolic one.
        pass

    try:
        return symbolic_equal(prediction, reference)
    except Exception:
        return False

def is_digit(num):
    """Return True when ``parse_digits`` yields a value for *num*."""
    parsed = parse_digits(num)
    return parsed is not None

def parse_digits(num):
    """Parse *num* into a float, accepting thousands separators and percents.

    Commas are removed before parsing. If the plain conversion fails, one
    trailing ``%`` and then one trailing backslash are dropped and the rest
    is parsed and divided by 100.

    Returns:
        The parsed float, or ``None`` when *num* is not numeric.
    """
    text = str(num).replace(",", "")
    try:
        return float(text)
    except ValueError:
        pass

    # A LaTeX percentage such as ``50\%`` leaves a backslash behind once the
    # percent sign is removed, so the backslash is stripped as well before
    # converting.
    if text.endswith("%"):
        text = text[:-1]
    if text.endswith("\\"):
        text = text[:-1]
    try:
        return float(text) / 100
    except ValueError:
        return None

def get_function_code(func):
    """Return the source text of *func*, or ``"no code"`` if it cannot be read.

    Only ``OSError`` from ``inspect.getsource`` is turned into the fallback
    string; any other error propagates to the caller.
    """
    try:
        return inspect.getsource(func)
    except OSError:
        return "no code"

def symbolic_equal(a, b):
    """Compare two expressions symbolically, then numerically.

    Each argument is parsed with ``parse_latex`` and, failing that, with
    ``parse_expr``; an argument neither parser accepts is used unchanged.
    The values are equal when ``simplify(a - b)`` is zero, or when their
    numeric evaluations agree within an absolute tolerance of 1e-3.

    Returns:
        ``True`` if either check succeeds, otherwise ``False``. Errors raised
        by a check are treated as "not equal by that check".
    """
    def _parse(s):
        # NOTE(security): sympy's parse_expr evaluates its input. The strings
        # reaching this function can be model output, so treat them as
        # untrusted if this code ever runs outside a sandboxed evaluation.
        for parser in (parse_latex, parse_expr):
            try:
                return parser(s)
            except Exception:
                continue
        return s

    a = _parse(a)
    b = _parse(b)

    try:
        if simplify(a - b) == 0:
            return True
    except Exception:
        # e.g. unparsed strings do not support subtraction; try numerically.
        pass

    try:
        if isclose(N(a), N(b), abs_tol=1e-3):
            return True
    except Exception:
        pass
    return False
def log_mismatch(problem: str,expected_output: Any,prediction: str,extracted_output: Any,extract_answer_code: str = "None",log_path=None):
    """Append one failed sample to ``log.json`` inside *log_path*.

    Entries already in the file are loaded first; a file that is missing or
    not valid JSON starts a fresh list. The whole list is then written back
    with ``write_json_file``.
    """
    log_file = Path(log_path) / "log.json"

    entries = []
    if log_file.exists():
        with log_file.open("r", encoding="utf-8") as f:
            try:
                entries = json.load(f)
            except json.JSONDecodeError:
                # Unreadable log content is discarded rather than aborting.
                entries = []

    entries.append(
        {
            "question": problem,
            "right_answer": expected_output,
            "model_output": prediction,
            "extracted_output": extracted_output,
            "extract_answer_code": extract_answer_code,
        }
    )
    write_json_file(log_file, entries, encoding="utf-8", indent=4)

async def evaluate_mbpp(data: dict, graph: Callable, log_path: str) -> Tuple[str, str, str, float]:
    """Run *graph* on one MBPP-style sample and score the generated code.

    Args:
        data: Mapping with the keys ``"prompt"``, ``"code"``, ``"test"`` and
            ``"entry_point"``.
        graph: Awaitable callable invoked with the prompt and the entry point.
        log_path: Directory forwarded to ``log_mismatch`` for failing samples.

    Returns:
        ``(input_text, prediction, expected_output, score)`` where ``score``
        is 1.0 when ``check_solution`` reports ``"PASS"`` and 0.0 otherwise.
        If anything raises, the exception text takes the place of the
        prediction and the score is 0.0.
    """
    input_text = data["prompt"]
    expected_output = "\nCorrect Solution:\ndef " + data["code"]

    try:
        # Generate the candidate solution, then run it against the tests.
        prediction = await graph(input_text, data["entry_point"])
        ret = check_solution(prediction, data["test"], data["entry_point"])

        # The second element of the check result carries the test details.
        test_case_details = ret[1]
        expected_output = test_case_details + "\nCorrect Solution:" + data["code"]

        score = 1.0 if ret[0] == "PASS" else 0.0

        if score == 0:
            log_mismatch(input_text, expected_output, prediction, score, log_path=log_path)

        return input_text, prediction, expected_output, score

    except Exception as e:
        # Best-effort evaluation: a failing sample is reported and scored 0.
        print(colored(f"Maximum retries reached. Skipping this sample. Error: {e}","light_red"))
        return input_text, str(e), expected_output, 0.0

def check_solution(solution, test, entry_point):
    """Execute a candidate solution against its test code.

    The solution is first passed through ``sanitize``, then executed in a
    fresh namespace; the test code is executed in the same namespace and its
    ``check`` function is run with a 15 second limit.

    Args:
        solution: Source code of the candidate solution.
        test: Source code that defines a ``check`` function.
        entry_point: Name the solution is required to define.

    Returns:
        ``("PASS", message)`` when ``check`` finishes and returns ``None``;
        ``("FAIL", message)`` on a timeout or on any other error. Errors
        other than a timeout are also appended to ``error.log`` in the
        current working directory.
    """
    timeout_marker = "Function execution timed out"
    solution = sanitize(code=solution, entrypoint=entry_point)
    try:
        global_dict = {
            "math": __import__("math"),
            "hashlib": __import__("hashlib"),
            "re": __import__("re"),
            "List": List,
            "Dict": Dict,
            "Tuple": Tuple,
            "Optional": Optional,
            "Any": Any,
        }

        # NOTE(security): this executes generated code in-process with no
        # sandboxing; only run it in an environment where that is acceptable.
        exec(solution, global_dict)

        if entry_point not in global_dict:
            raise ValueError(f"Function {entry_point} is not defined in the solution.")

        exec(test, global_dict)

        check = global_dict["check"]

        result = run_with_timeout(check, 15)

        # NOTE(review): a non-None return value from ``check`` is passed
        # through unchanged, as before.
        if result is None:
            result = ("PASS", "The solution passed all test cases.")

    except Exception as e:
        # Timeouts and ordinary failures used to share one handler, which left
        # the error-reporting branch unreachable. A timeout is recognised by
        # its exception type or by the message run_with_timeout raises.
        if isinstance(e, TimeoutError) or str(e) == timeout_marker:
            result = (
                "FAIL",
                "Execution timed out. Please check if your solution contains infinite loops or overly time-consuming operations.",
            )
        else:
            error_message = f"Error: {str(e)}.\n Solution: {solution}.\n Test: {test}"
            result = ("FAIL", error_message)

            with open("error.log", "a", encoding="utf-8") as log_file:
                log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n")

    return result

def run_with_timeout(func, timeout):
    """Call ``func()`` in a worker thread and wait at most *timeout* seconds.

    Args:
        func: Zero-argument callable to execute.
        timeout: Maximum number of seconds to wait for completion.

    Returns:
        Whatever ``func`` returns.

    Raises:
        TimeoutError: If ``func`` has not finished within *timeout* seconds.
            The worker cannot be interrupted and keeps running in the
            background; it is a daemon thread so it does not block
            interpreter shutdown.
        Exception: Any ``Exception`` raised by ``func`` is re-raised here.
    """
    # Each entry is (succeeded, payload) so that a returned exception object
    # is not mistaken for a raised one.
    outcome = []
    finished = threading.Event()

    def target():
        try:
            outcome.append((True, func()))
        except Exception as e:
            outcome.append((False, e))
        finally:
            finished.set()

    worker = threading.Thread(target=target, daemon=True)
    worker.start()

    if not finished.wait(timeout):
        raise TimeoutError("Function execution timed out")

    if not outcome:
        # The worker ended without recording anything (it was stopped by
        # something that is not an Exception subclass).
        return None

    succeeded, payload = outcome[0]
    if not succeeded:
        raise payload
    return payload
Empty file.
Empty file.
19 changes: 19 additions & 0 deletions methods/aflow/initial_workflows/math/round_1/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Literal
from ..template import operator
from ..round_1 import prompt as prompt_custom

# Closed set of dataset names accepted wherever DatasetType is used as an
# annotation. NOTE(review): the Workflow class in this file does not reference
# it - confirm whether other modules import it before removing.
DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"]
class Workflow:
    """Initial workflow: a single call to the ``Custom`` operator."""

    def __init__(self, name: str, env) -> None:
        """Store the workflow name and build the operator around *env*."""
        self.name = name
        self.llm = env
        self.custom = operator.Custom(self.llm)

    async def __call__(self, problem: str):
        """Send *problem* to the custom operator and return its response.

        The operator is called with an empty instruction; the value under
        the ``'response'`` key of its result is returned.
        """
        operator_result = await self.custom(input=problem, instruction="")
        answer = operator_result["response"]
        return answer


Empty file.
6 changes: 6 additions & 0 deletions methods/aflow/initial_workflows/math/round_1/prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Placeholder prompt text for this round.
# NOTE(review): the round_1 graph imports this module as ``prompt_custom`` but
# the code shown there never reads XXX_PROMPT - confirm how it is consumed.
XXX_PROMPT = """

Solve it.

"""

Empty file.
29 changes: 29 additions & 0 deletions methods/aflow/initial_workflows/math/template/op_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Prompt template for picking the most frequent answer among several
# candidate solutions. Contains the placeholders {problem} and {solutions}
# and asks the model to fill a "thought" field and a "solution_letter" field.
# NOTE(review): presumably rendered with str.format - confirm at the call site.
SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {problem}
Several solutions have been generated to address the given question. They are as follows:
{solutions}

Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.

In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""

# Prompt template asking the model to write a self-contained Python program
# that defines a parameterless `solve` function returning the answer.
# Contains the placeholders {problem}, {analysis} and {feedback}; the model is
# told to wrap its code in <Code Solution> ... </Code Solution> tags.
# NOTE(review): presumably rendered with str.format - confirm at the call site.
PYTHON_CODE_VERIFIER_PROMPT = """
You are a professional Python programmer. Your task is to write complete, self-contained code based on a given mathematical problem and output the answer. The code should include all necessary imports and dependencies, and be ready to run without additional setup or environment configuration.

Problem description: {problem}
Other analysis: {analysis}
{feedback}

Your code should:
1. Implement the calculation steps described in the problem.
2. Define a function named `solve` that performs the calculation and returns the result. The `solve` function should not require any input parameters; instead, it should obtain all necessary inputs from within the function or from globally defined variables.
3. `solve` function return the final calculation result.

Please ensure your code is efficient, well-commented, and follows Python best practices. The output should be limited to basic data types such as strings, integers, and floats. It is prohibited to transmit images or other file formats. The code output is intended for a text-based language model.

Wrap your final code solution in <Code Solution> and </Code Solution>. For example:
<Code Solution>
Your function code here
</Code Solution>
"""
14 changes: 14 additions & 0 deletions methods/aflow/initial_workflows/math/template/operator.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"Custom": {
"description": "Generates anything based on customized input and instruction.",
"interface": "custom(input: str, instruction: str) -> dict with key 'response' of type str"
},
"ScEnsemble": {
"description": "Uses self-consistency to select the solution that appears most frequently in the solution list, improve the selection to enhance the choice of the best solution.",
"interface": "sc_ensemble(solutions: List[str], problem: str) -> dict with key 'response' of type str"
},
"Programmer": {
"description": "Automatically writes, executes Python code, and returns the solution based on the provided problem description and analysis. The `output` only contains the final answer. If you want to see the detailed solution process, it's recommended to retrieve the `code`.",
"interface": "programmer(problem: str, analysis: str = 'None') -> dict with keys 'code' and 'output' of type str"
}
}
Loading