diff --git a/methods/ADAS/__init__.py b/methods/ADAS/__init__.py new file mode 100644 index 0000000..f30d74c --- /dev/null +++ b/methods/ADAS/__init__.py @@ -0,0 +1,5 @@ +from .adas_drop import ADAS_DROP +from .adas_gpqa import ADAS_GPQA +from .adas_mgsm import ADAS_MGSM +from .adas_mmlu import ADAS_MMLU +from .adas_math import ADAS_MATH \ No newline at end of file diff --git a/methods/ADAS/adas_drop.py b/methods/ADAS/adas_drop.py new file mode 100644 index 0000000..cd6d5f9 --- /dev/null +++ b/methods/ADAS/adas_drop.py @@ -0,0 +1,392 @@ +import os +from ..mas_base import MAS +from ..utils import load_config +from .adas_utils import random_id, bootstrap_confidence_interval, load_drop, drop_metric + +import copy +import json +import os +import random +from pathlib import Path +from collections import namedtuple +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import openai +import argparse +from tqdm import tqdm +from .prompt.drop_prompt import get_init_archive, get_prompt, get_reflexion_prompt + +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +FORMAT_INST = lambda request_keys: f"""Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY REQUEST FIELDS and ensure that your response is a well-formed JSON object!\n""" +ROLE_DESC = lambda role: f"You are a {role}." 
import threading

SYSTEM_MSG = ""

PRINT_LLM_DEBUG = False
# True while candidates are evaluated during `optimizing`; `ADAS_DROP.inference`
# sets it to False. It selects which token-stat dict a call is booked under.
SEARCHING_MODE = True
# Connection settings read by every LLMAgentBase when it is constructed.
# They are module-level because the generated `forward` code instantiates
# LLMAgentBase directly; ADAS_DROP assigns them.
execute_model = None
api_key = None
base_url = None
# Execute-model usage counters, keyed by model name, one dict per phase.
optimize_execute_token_stats = {}
inference_execute_token_stats = {}
# `forward` runs inside a ThreadPoolExecutor, so counter updates need a lock.
_TOKEN_STATS_LOCK = threading.Lock()


def merge_token_stats(target, *sources):
    """Add every per-model counter found in ``sources`` into ``target``.

    Args:
        target: dict of ``{model: {counter_name: number}}``; mutated in place.
        *sources: dicts of the same shape whose values are added to ``target``.

    Returns:
        The same ``target`` object.
    """
    for source in sources:
        for model_key, stats in source.items():
            for counter, value in stats.items():
                target.setdefault(model_key, {}).setdefault(counter, 0)
                target[model_key][counter] += value
    return target


def _record_token_usage(model, prompt_tokens, completion_tokens):
    """Book one LLM call under the counter dict of the current phase."""
    stats = optimize_execute_token_stats if SEARCHING_MODE else inference_execute_token_stats
    with _TOKEN_STATS_LOCK:
        entry = stats.setdefault(
            model, {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0}
        )
        entry["num_llm_calls"] += 1
        entry["prompt_tokens"] += prompt_tokens
        entry["completion_tokens"] += completion_tokens


def _drain_token_stats(target):
    """Move the module-level counters into ``target`` and reset them.

    The module-level dicts are cumulative. Merging them repeatedly without
    resetting would add the same calls to ``target`` again on every merge.
    """
    with _TOKEN_STATS_LOCK:
        merge_token_stats(target, optimize_execute_token_stats, inference_execute_token_stats)
        optimize_execute_token_stats.clear()
        inference_execute_token_stats.clear()
    return target


class AgentSystem():
    """Empty holder class; a generated ``forward`` function is attached to it at runtime."""

    def __init__(self) -> None:
        pass


class LLMAgentBase():
    """One LLM-backed agent that answers with a JSON object of named fields."""

    def __init__(self, output_fields: list, agent_name: str,
                 role='helpful assistant', temperature=0.5) -> None:
        """Store the agent settings and open a client with the module-level credentials.

        Args:
            output_fields: names of the JSON fields the model is asked to return.
            agent_name: label used in ``repr`` and in prompts.
            role: text inserted into the "You are a ..." system prompt.
            temperature: sampling temperature passed to the chat API.
        """
        self.output_fields = output_fields
        self.agent_name = agent_name

        self.role = role
        self.model = execute_model
        self.temperature = temperature

        # give each instance a unique id
        self.id = random_id()
        self.client = openai.OpenAI(api_key=api_key, base_url=base_url)

    def get_json_response_from_gpt(
        self,
        msg,
        model,
        system_message,
        temperature=0
    ):
        """Send one system+user exchange and return the reply parsed as JSON.

        Token usage is recorded before parsing so that a call whose reply is
        not valid JSON is still counted.

        Raises:
            AssertionError: if the reply parses to JSON ``null``.
            Exception: whatever the client or ``json.loads`` raises.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": msg},
            ],
            temperature=temperature, max_tokens=4096, stop=None, response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content

        usage = getattr(response, "usage", None)
        if usage is not None:
            _record_token_usage(model, usage.prompt_tokens, usage.completion_tokens)

        json_dict = json.loads(content)
        if json_dict is None:
            raise AssertionError("LLM reply parsed to JSON null")
        return json_dict

    def generate_prompt(self, input_infos, instruction) -> tuple:
        """Build the ``(system_prompt, user_prompt)`` pair for one query.

        Items of ``input_infos`` that are not ``Info`` tuples are ignored.
        """
        # construct system prompt
        output_fields_and_description = {
            key: f"Your {key}." if 'answer' not in key
            else f"Your {key}. Directly answer the question. Keep it very concise."
            for key in self.output_fields
        }
        system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description)

        # construct input infos text
        input_infos_text = ''
        for input_info in input_infos:
            if not isinstance(input_info, Info):
                continue
            (field_name, author, content, iteration_idx) = input_info
            if author == repr(self):
                author += ' (yourself)'
            if field_name == 'task':
                input_infos_text += f'# Your Task:\n{content}\n\n'
            elif iteration_idx != -1:
                input_infos_text += f'### {field_name} #{iteration_idx + 1} by {author}:\n{content}\n\n'
            else:
                input_infos_text += f'### {field_name} by {author}:\n{content}\n\n'

        prompt = input_infos_text + instruction
        return system_prompt, prompt

    def query(self, input_infos: list, instruction, iteration_idx=-1) -> list:
        """Ask the model and return one ``Info`` per field of its reply.

        Failures are best-effort: any error from the call leaves an empty
        reply, which is then padded with ``''`` for each expected field. The
        one exception is a context-length error during search, which is
        re-raised so the candidate design gets that feedback.

        Raises:
            AssertionError: on a "maximum context length" error while
                ``SEARCHING_MODE`` is True.
        """
        system_prompt, prompt = self.generate_prompt(input_infos, instruction)
        response_json = {}
        try:
            parsed = self.get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature)
            # A non-object JSON reply (list, number, ...) is treated as empty.
            if isinstance(parsed, dict):
                response_json = parsed
        except Exception as e:
            if "maximum context length" in str(e) and SEARCHING_MODE:
                raise AssertionError("The context is too long. Please try to design the agent to have shorter context.") from e

        if len(response_json) != len(self.output_fields):
            # try to fill in the missing field
            for key in self.output_fields:
                if key not in response_json and len(response_json) < len(self.output_fields):
                    response_json[key] = ''
            # drop unexpected fields while there are still too many
            for key in list(response_json.keys()):
                if len(response_json) > len(self.output_fields) and key not in self.output_fields:
                    del response_json[key]

        return [Info(key, repr(self), value, iteration_idx) for key, value in response_json.items()]

    def __repr__(self):
        return f"{self.agent_name} {self.id}"

    def __call__(self, input_infos: list, instruction, iteration_idx=-1):
        return self.query(input_infos, instruction, iteration_idx=iteration_idx)


class ADAS_DROP(MAS):
    """ADAS search and inference for the DROP task.

    ``optimizing`` asks a meta model for new ``forward`` implementations,
    scores them with ``evaluate_forward_fn`` and stores the best one;
    ``inference`` loads that stored implementation and runs it on one sample.
    """

    def __init__(self, general_config, method_config_name):
        """Read model settings from ``general_config`` and search settings from
        ``configs/<method_config_name>.yaml`` next to this file."""
        super().__init__(general_config)
        # set the meta model and execute model for optimizing mode
        self.model_api_config = general_config["model_api_config"]
        self.optimize_execute_model = general_config['optimize_execute_model_name']
        self.optimize_meta_model = general_config['optimize_meta_model_name']
        global execute_model, api_key, base_url
        execute_model = self.optimize_execute_model
        self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0]
        api_key = self.execute_model_dict['api_key']
        base_url = self.execute_model_dict['model_url']
        self.max_workers = self.model_api_config[execute_model]["max_workers"]
        self.inference_model = general_config['model_name']

        self.config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml"))
        self.domain = method_config_name
        self.valid_size = self.config["valid_size"]
        self.test_size = self.config["test_size"]
        self.shuffle_seed = self.config["shuffle_seed"]
        # "n_repreat" is the key spelling used by the config file.
        self.n_repreat = self.config["n_repreat"]
        self.multiprocessing = self.config["multiprocessing"]
        self.debug = self.config["debug"]

        self.save_dir = Path(__file__).parents[2] / "results" / self.domain / f"adas_{self.optimize_meta_model}_optimize_{self.optimize_execute_model}_execute"
        self.optimizing_path = self.save_dir / "archive.json"
        self.inference_path = self.save_dir / "best_workflow.json"
        self.n_generation = self.config["n_generation"]
        self.debug_max = self.config["debug_max"]
        self.args = argparse.Namespace(**self.config)

    def call_llm(self, prompt=None, system_prompt=None, messages=None):
        """Query the meta model through the base class and strip Markdown code fences."""
        response = super().call_llm(prompt=prompt, system_prompt=system_prompt, messages=messages, model_name=self.optimize_meta_model)
        formatted_response = response.strip().replace('```json', '').replace('```', '')
        return formatted_response

    @staticmethod
    def _parse_solution(text):
        """Parse a meta-model reply into a solution dict.

        Raises:
            ValueError: if the text is not JSON or is not a JSON object.
        """
        solution = json.loads(text.strip().replace('```json', '').replace('```', ''))
        if not isinstance(solution, dict):
            raise ValueError(f"expected a JSON object, got {type(solution).__name__}")
        return solution

    @staticmethod
    def _install_forward(forward_str):
        """Define the single function in ``forward_str`` and attach it as ``AgentSystem.forward``.

        SECURITY NOTE: this executes model-generated source with ``exec``.
        Only run it on code you are prepared to execute.

        Raises:
            AssertionError: if the source does not define exactly one name or
                that name is not callable.
        """
        # modified from https://github.com/luchris429/DiscoPOP/blob/main/scripts/launch_evo.py
        namespace = {}
        exec(forward_str, globals(), namespace)
        names = list(namespace.keys())
        if len(names) != 1:
            raise AssertionError(f"{len(names)} things in namespace. Please only provide 1")
        func = namespace[names[0]]
        if not callable(func):
            raise AssertionError(f"{func} is not callable")
        setattr(AgentSystem, "forward", func)

    def evaluate_forward_fn(self, forward_str, data_path, searching_mode=True):
        """Run a candidate ``forward`` over DROP examples and return per-example F1.

        Args:
            forward_str: source code defining exactly one function.
            data_path: path handed to ``load_drop``.
            searching_mode: True scores the validation slice, False the test slice.

        Returns:
            List with one score per evaluated example; 0 where scoring failed.
        """
        self._install_forward(forward_str)

        examples = load_drop(data_path)[1:-1]  # first one and the last one is for few-shot examples
        random.seed(self.shuffle_seed)
        random.shuffle(examples)

        if searching_mode:
            examples = examples[:self.valid_size] * self.n_repreat
        else:
            examples = examples[self.valid_size:self.valid_size + self.test_size] * self.n_repreat

        answers = [example['targets'] for example in examples]
        task_queue = [Info('task', 'User', example['inputs'], -1) for example in examples]

        print(f"problem length: {len(examples)}")
        max_workers = min(len(examples), self.max_workers) if self.multiprocessing else 1

        agentSystem = AgentSystem()

        # An exception raised inside `forward` propagates from here on purpose:
        # `optimizing` feeds it back to the meta model as debugging input.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(executor.map(agentSystem.forward, task_queue), total=len(task_queue)))

        acc_list = []
        for q_idx, res in enumerate(results):
            try:
                extracted_answer = res.content if isinstance(res, Info) else res
                _em_score, f1_score = drop_metric(extracted_answer, answers[q_idx])
            except Exception:
                # An unscorable answer counts as wrong.
                acc_list.append(0)
                continue
            acc_list.append(f1_score)

        print(f"f1: {bootstrap_confidence_interval(acc_list)}")
        return acc_list

    def optimizing(self, val_dataset):
        """Search for a better ``forward`` and save the archive and the best solution.

        Writes ``archive.json`` after the initial archive and after every
        accepted generation, and ``best_workflow.json`` at the end.

        Raises:
            RuntimeError: if no solution could be evaluated at all.
        """
        # create save dir
        self.save_dir.mkdir(parents=True, exist_ok=True)

        best_fitness = 0
        best_solution = None
        # use validation dataset to optimize
        archive = get_init_archive()

        for solution in archive:
            if 'fitness' in solution:
                continue

            solution['generation'] = "initial"
            print(f"============Initial Archive: {solution['name']}=================")
            try:
                acc_list = self.evaluate_forward_fn(solution["code"], data_path=val_dataset)
            except Exception as e:
                print("During evaluating initial archive:")
                print(e)
                continue

            fitness, fitness_str = bootstrap_confidence_interval(acc_list)
            solution['fitness'] = fitness_str
            # Keep the highest-scoring agent. The first evaluated one is taken
            # even at zero fitness so there is always something to save.
            if best_solution is None or fitness > best_fitness:
                best_fitness = fitness
                best_solution = solution

        with open(self.optimizing_path, 'w') as json_file:
            json.dump(archive, json_file, indent=4)

        for n in range(self.n_generation):
            print(f"============Generation {n + 1}=================")
            system_prompt, prompt = get_prompt(archive)
            msg_list = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ]

            try:
                next_solution = self.call_llm(messages=msg_list)
                print(next_solution)
                Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None)
                # Reflexion 1
                msg_list.append({"role": "assistant", "content": str(next_solution)})
                msg_list.append({"role": "user", "content": Reflexion_prompt_1})
                next_solution = self.call_llm(messages=msg_list)

                # Reflexion 2
                msg_list.append({"role": "assistant", "content": str(next_solution)})
                msg_list.append({"role": "user", "content": Reflexion_prompt_2})
                next_solution = self._parse_solution(self.call_llm(messages=msg_list))
            except Exception as e:
                # A failed generation is skipped, not retried: the `for` loop
                # owns `n`, so decrementing it here would have no effect.
                print("During optimizing:")
                print(e)
                continue

            acc_list = []
            for _ in range(self.debug_max):
                try:
                    acc_list = self.evaluate_forward_fn(next_solution["code"], data_path=val_dataset)
                    if np.mean(acc_list) < 0.01 and SEARCHING_MODE:
                        raise Exception("All 0 accuracy")
                    break
                except Exception as e:
                    print("During evaluation:")
                    print(e)
                    msg_list.append({"role": "assistant", "content": str(next_solution)})
                    msg_list.append({"role": "user", "content": f"Error during evaluation:\n{e}\nCarefully consider where you went wrong in your latest implementation. Using insights from previous attempts, try to debug the current code to implement the same thought. Repeat your previous thought in 'thought', and put your thinking for debugging in 'debug_thought'"})
                    try:
                        # The reply must be passed as `messages` and parsed;
                        # otherwise the next round indexes a plain string.
                        next_solution = self._parse_solution(self.call_llm(messages=msg_list))
                    except Exception as llm_error:
                        # Keep the previous candidate and try again.
                        print("During LLM generate new solution:")
                        print(llm_error)
            if not acc_list:
                continue

            fitness, fitness_str = bootstrap_confidence_interval(acc_list)
            next_solution['fitness'] = fitness_str
            next_solution['generation'] = n + 1

            next_solution.pop('debug_thought', None)
            next_solution.pop('reflection', None)
            print(next_solution.get('name'))
            print(next_solution['code'])

            if best_solution is None or fitness > best_fitness:
                best_fitness = fitness
                best_solution = next_solution

            archive.append(next_solution)

            with open(self.optimizing_path, 'w') as json_file:
                json.dump(archive, json_file, indent=4)

        if best_solution is None:
            raise RuntimeError("No solution could be evaluated; nothing to save as best workflow.")

        with open(self.inference_path, 'w') as json_file:
            json.dump(best_solution, json_file, indent=4)

    def inference(self, sample):
        """Answer one sample with the stored best workflow.

        Args:
            sample: mapping with a non-empty ``"query"`` entry.

        Returns:
            The string returned by ``forward``, or its ``.content`` otherwise.

        Raises:
            ValueError: if ``sample`` has no query.
            NotImplementedError: if ``best_workflow.json`` does not exist.
        """
        query = sample.get("query")
        if not query:
            raise ValueError("Sample must contain a 'query' key.")
        global execute_model, api_key, base_url, SEARCHING_MODE
        SEARCHING_MODE = False
        execute_model = self.inference_model
        self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0]
        api_key = self.execute_model_dict['api_key']
        base_url = self.execute_model_dict['model_url']
        self.max_workers = self.model_api_config[execute_model]["max_workers"]
        print(f'execute model: {execute_model}')
        if not os.path.exists(self.inference_path):
            raise NotImplementedError("The specified best workflow path does not exist.")
        with open(self.inference_path, 'r') as json_file:
            best_solution = json.load(json_file)

        self._install_forward(best_solution["code"])
        agentSystem = AgentSystem()

        taskInfo = Info('task', 'User', query, -1)
        response = agentSystem.forward(taskInfo)
        if not isinstance(response, str):
            response = response.content

        # Drain rather than merge: the module-level counters are cumulative,
        # so a plain merge would re-add earlier calls on every inference.
        _drain_token_stats(self.token_stats)

        return response


# ---------------------------------------------------------------------------
# diff --git a/methods/ADAS/adas_gpqa.py b/methods/ADAS/adas_gpqa.py
# new file mode 100644
# index 0000000..898c46f
# --- /dev/null
# +++ b/methods/ADAS/adas_gpqa.py
# @@ -0,0 +1,420 @@
import argparse
import openai
import os
import copy
import json
import random

from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from pathlib import Path
from tqdm import tqdm

from ..mas_base import MAS
from ..utils import load_config
from .adas_utils import random_id, bootstrap_confidence_interval
from .prompt.gpqa_prompt import get_init_archive, get_prompt, get_reflexion_prompt

Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx'])

FORMAT_INST = lambda request_keys: f"""Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY REQUEST FIELDS and ensure that your response is a well-formed JSON object!\n"""
ROLE_DESC = lambda role: f"You are a {role}."
+SYSTEM_MSG = "" + +PRINT_LLM_DEBUG = False +SEARCHING_MODE = True +execute_model = None +api_key = "Please_set_your_own_key" +base_url = "Please_set_your_own_base_url" +optimize_execute_token_stats = {} +inference_execute_token_stats = {} + +def merge_token_stats(target, *sources): + for d in sources: + for model_key, stats in d.items(): + for k, v in stats.items(): + target.setdefault(model_key, {}).setdefault(k, 0) + target[model_key][k] += v + return target + +class AgentSystem(): + def __init__(self) -> None: + pass + +class LLMAgentBase(): + """ + Attributes: + """ + + def __init__(self, output_fields: list, agent_name: str, + role='helpful assistant', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + + self.role = role + self.model = execute_model + self.temperature = temperature + + # give each instance a unique id + self.id = random_id() + self.client = openai.OpenAI(api_key=api_key, base_url=base_url) + + def get_json_response_from_gpt( + self, + msg, + model, + system_message, + temperature=0 + ): + response = self.client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, max_tokens=4096, stop=None, response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + num_prompt_tokens = response.usage.prompt_tokens + num_completion_tokens = response.usage.completion_tokens + if isinstance(content, str) and SEARCHING_MODE: # in cases where response is None or an error message + if model not in optimize_execute_token_stats: + optimize_execute_token_stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + optimize_execute_token_stats[model]["num_llm_calls"] += 1 + optimize_execute_token_stats[model]["prompt_tokens"] += num_prompt_tokens + optimize_execute_token_stats[model]["completion_tokens"] += 
num_completion_tokens + elif isinstance(content, str) and not SEARCHING_MODE: + if model not in inference_execute_token_stats: + inference_execute_token_stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + inference_execute_token_stats[model]["num_llm_calls"] += 1 + inference_execute_token_stats[model]["prompt_tokens"] += num_prompt_tokens + inference_execute_token_stats[model]["completion_tokens"] += num_completion_tokens + # cost = response.usage.completion_tokens / 1000000 * 15 + response.usage.prompt_tokens / 1000000 * 5 + assert not json_dict is None + return json_dict + + def generate_prompt(self, input_infos, instruction) -> str: + # construct system prompt + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + # construct input infos text + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx + 1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> dict: + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + try: + response_json = {} + response_json = self.get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + assert len(response_json) == len(self.output_fields), "not returning enough fields" + except Exception as e: + # 
print(e) + if "maximum context length" in str(e) and SEARCHING_MODE: + raise AssertionError("The context is too long. Please try to design the agent to have shorter context.") + # try to fill in the missing field + for key in self.output_fields: + if not key in response_json and len(response_json) < len(self.output_fields): + response_json[key] = '' + for key in copy.deepcopy(list(response_json.keys())): + if len(response_json) > len(self.output_fields) and not key in self.output_fields: + del response_json[key] + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class ADAS_GPQA(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + # set the meta model and execute model for optimizing mode + self.method_config = load_config( + Path(__file__).parent / "configs" / f"{method_config_name}.yaml" + ) + self.optimize_execute_model = general_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + self.optimize_meta_model = general_config.get('optimize_meta_model_name','gpt-4o') + global execute_model, api_key, base_url + execute_model = self.optimize_execute_model + self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0] + api_key = self.execute_model_dict['api_key'] + base_url = self.execute_model_dict['model_url'] + self.max_workers = self.model_api_config[execute_model]["max_workers"] + self.inference_model = general_config['model_name'] + + self.config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml")) + self.domain = method_config_name + self.valid_size = self.config["valid_size"] + 
self.test_size = self.config["test_size"] + self.shuffle_seed = self.config["shuffle_seed"] + self.n_repreat = self.config["n_repreat"] + self.multiprocessing = self.config["multiprocessing"] + self.debug = self.config["debug"] + + self.save_dir = Path(__file__).parents[2] / "results" / self.domain / f"adas_{self.optimize_meta_model}_optimize_{self.optimize_execute_model}_execute" + self.optimizing_path = self.save_dir / "archive.json" + self.inference_path = self.save_dir / "best_workflow.json" + self.n_generation = self.config["n_generation"] + self.debug_max = self.config["debug_max"] + self.self = argparse.Namespace(**self.config) + + + def call_llm(self, prompt=None, system_prompt=None, messages=None): + response = super().call_llm(prompt=prompt, system_prompt=system_prompt, messages=messages, model_name=self.optimize_meta_model) + formatted_response = response.strip().replace('```json', '').replace('```', '') + return formatted_response + + def evaluate_forward_fn(self, forward_str, val_dataset, searching_mode=True): + # dynamically define forward() + # modified from https://github.com/luchris429/DiscoPOP/blob/main/scripts/launch_evo.py + namespace = {} + exec(forward_str, globals(), namespace) + names = list(namespace.keys()) + if len(names) != 1: + raise AssertionError(f"{len(names)} things in namespace. 
Please only provide 1") + func = namespace[names[0]] + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + # print(f"forward function defined:\n{forward_str}") + + LETTER_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + # set seed 0 for valid set + random.seed(self.shuffle_seed) + examples = random.sample(val_dataset, len(val_dataset)) + random.shuffle(examples) + + if searching_mode: + val_questions = examples[:self.valid_size] * self.n_repreat + else: + val_questions = examples[self.valid_size:] * self.n_repreat + + print(f"problem length: {len(val_questions)}") + max_workers = min(len(val_questions), self.max_workers) if self.multiprocessing else 1 + + task_queue = [] + for q in val_questions: + task_content = f"What is the correct answer to this question: {q['query']}" \ + + f"\n\nChoices:\n(A) {q['choices'][0]}\n(B) {q['choices'][1]}\n(C) {q['choices'][2]}\n(D) {q['choices'][3]}" + taskInfo = Info('task', 'User', task_content, -1) + task_queue.append(taskInfo) + + agentSystem = AgentSystem() + + acc_list = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + results = list(tqdm(executor.map(agentSystem.forward, task_queue), total=len(task_queue))) + + for q_idx, res in enumerate(results): + try: + if isinstance(res, str) and res in LETTER_TO_INDEX: + predicted_idx = LETTER_TO_INDEX[res] + elif 'A)' in res: + predicted_idx = 0 + elif 'B)' in res: + predicted_idx = 1 + elif 'C)' in res: + predicted_idx = 2 + elif 'D)' in res: + predicted_idx = 3 + elif isinstance(res, list): + try_res = res[1] + predicted_idx = LETTER_TO_INDEX[try_res.content] + elif res.content in LETTER_TO_INDEX: + predicted_idx = LETTER_TO_INDEX[res.content] + elif 'A)' in res.content: + predicted_idx = 0 + elif 'B)' in res.content: + predicted_idx = 1 + elif 'C)' in res.content: + predicted_idx = 2 + elif 'D)' in res.content: + predicted_idx = 3 + else: + print(f"error in q {q_idx}") + acc_list.append(0) + continue + 
except Exception as e: + acc_list.append(0) + continue + + if predicted_idx == val_questions[q_idx]['correct_index']: + acc_list.append(1) + else: + acc_list.append(0) + print(f"acc: {bootstrap_confidence_interval(acc_list)}") + return acc_list + + def optimizing(self, val_dataset): + # create save dir + os.makedirs(os.path.dirname(self.optimizing_path), exist_ok=True) + os.makedirs(os.path.dirname(self.inference_path), exist_ok=True) + + best_fitness = 0 + # training + # use validation dataset to optimize + archive = get_init_archive() + start = 0 + + for solution in archive: + if 'fitness' in solution: + continue + + solution['generation'] = "initial" + print(f"============Initial Archive: {solution['name']}=================") + # self.dynamic_forward(solution["code"]) + try: + acc_list = self.evaluate_forward_fn(solution["code"],val_dataset) + except Exception as e: + print("During evaluating initial archive:") + print(e) + continue + + fitness, fitness_str = bootstrap_confidence_interval(acc_list) + solution['fitness'] = fitness_str + # save the agent with the highest median score + if fitness > best_fitness: + best_fitness = fitness + best_solution = solution + + with open(self.optimizing_path, 'w') as json_file: + json.dump(archive, json_file, indent=4) + + for n in range(start, self.n_generation): + print(f"============Generation {n + 1}=================") + system_prompt, prompt = get_prompt(archive) + msg_list = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ] + + try: + next_solution = self.call_llm(messages=msg_list) + print(next_solution) + Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None) + # Reflexion 1 + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": Reflexion_prompt_1}) + next_solution = self.call_llm(messages=msg_list) + + # Reflexion 2 + msg_list.append({"role": "assistant", "content": 
str(next_solution)}) + msg_list.append({"role": "user", "content": Reflexion_prompt_2}) + next_solution = self.call_llm(messages=msg_list) + next_solution = next_solution.strip().replace('```json', '').replace('```', ',') + next_solution = json.loads(next_solution) + except Exception as e: + print("During optimizing:") + print(e) + n -= 1 + continue + + acc_list = [] + for _ in range(self.debug_max): + try: + acc_list = self.evaluate_forward_fn(next_solution["code"], val_dataset) + if np.mean(acc_list) < 0.01 and SEARCHING_MODE: + raise Exception("All 0 accuracy") + break + except Exception as e: + print("During evaluation:") + print(e) + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": f"Error during evaluation:\n{e}\nCarefully consider where you went wrong in your latest implementation. Using insights from previous attempts, try to debug the current code to implement the same thought. Repeat your previous thought in 'thought', and put your thinking for debugging in 'debug_thought'"}) + try: + next_solution = self.call_llm(msg_list) + except Exception as e: + print("During LLM generate new solution:") + print(e) + continue + continue + if not acc_list: + n -= 1 + continue + + fitness, fitness_str = bootstrap_confidence_interval(acc_list) + next_solution['fitness'] = fitness_str + next_solution['generation'] = n + 1 + + if 'debug_thought' in next_solution: + del next_solution['debug_thought'] + if 'reflection' in next_solution: + del next_solution['reflection'] + print(next_solution['name']) + print(next_solution['code']) + + # save the agent with the highest median score + if fitness > best_fitness: + best_fitness = fitness + best_solution = next_solution + + archive.append(next_solution) + + with open(self.optimizing_path, 'w') as json_file: + json.dump(archive, json_file, indent=4) + + with open(self.inference_path, 'w') as json_file: + json.dump(best_solution, json_file, indent=4) + + def 
inference(self, sample): + query = sample.get("query") + if not query: + raise ValueError("Sample must contain a 'query' key.") + global execute_model, api_key, base_url, SEARCHING_MODE + SEARCHING_MODE = False + execute_model = self.inference_model + self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0] + api_key = self.execute_model_dict['api_key'] + base_url = self.execute_model_dict['model_url'] + self.max_workers = self.model_api_config[execute_model]["max_workers"] + if not os.path.exists(self.inference_path): + raise NotImplementedError("The specified best workflow path does not exist.") + with open(self.inference_path, 'r') as json_file: + best_solution = json.load(json_file) + # print(best_solution) + + namespace = {} + exec(best_solution["code"], globals(), namespace) + names = list(namespace.keys()) + if len(names) != 1: + raise AssertionError(f"{len(names)} things in namespace. Please only provide 1") + func = namespace[names[0]] + + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + agentSystem = AgentSystem() + + taskInfo = Info('task', 'User', query, -1) + response = agentSystem.forward(taskInfo) + if isinstance(response, str): + response = response + else: + response = response.content + + merge_token_stats(self.token_stats, optimize_execute_token_stats, inference_execute_token_stats) + + return response \ No newline at end of file diff --git a/methods/ADAS/adas_math.py b/methods/ADAS/adas_math.py new file mode 100644 index 0000000..6588f12 --- /dev/null +++ b/methods/ADAS/adas_math.py @@ -0,0 +1,387 @@ +# This is actually the version adapted to the MATH-500 +import argparse +import copy +import json +import numpy as np +import openai +import os +import random + +from collections import namedtuple +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from termcolor import colored +from tqdm import tqdm + +from ..mas_base import MAS +from 
def get_json_response_from_gpt(msg, model, system_message, temperature=0):
    """Send one system+user exchange to the chat API and return the decoded JSON reply.

    The request is made through the module-level ``client`` with
    ``response_format={"type": "json_object"}`` and ``max_tokens=4096``.
    Token usage reported by the API is added to
    ``optimize_execute_token_stats`` while ``SEARCHING_MODE`` is true and to
    ``inference_execute_token_stats`` otherwise, keyed by ``model``.

    Args:
        msg: Text of the user message.
        model: Model identifier passed to the API; also the stats key.
        system_message: Text of the system message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The object produced by ``json.loads`` on the reply content.

    Raises:
        AssertionError: If the decoded reply is ``None``.
        Exception: Whatever the client call or ``json.loads`` raises.
    """
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": msg},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=temperature,
        max_tokens=4096,
        stop=None,
        response_format={"type": "json_object"},
    )
    content = response.choices[0].message.content
    # Decoding happens before accounting, so a reply that fails to parse is
    # not counted in either stats dict.
    json_dict = json.loads(content)

    usage = response.usage
    prompt_tokens = usage.prompt_tokens
    completion_tokens = usage.completion_tokens

    if isinstance(content, str):
        # Search-time and inference-time usage are tracked separately.
        if SEARCHING_MODE:
            stats = optimize_execute_token_stats
        else:
            stats = inference_execute_token_stats
        if model not in stats:
            stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0}
        entry = stats[model]
        entry["num_llm_calls"] += 1
        entry["prompt_tokens"] += prompt_tokens
        entry["completion_tokens"] += completion_tokens

    assert json_dict is not None
    return json_dict
self.temperature) + assert len(response_json) == len(self.output_fields), "not returning enough fields" + except Exception as e: + # print(e) + if "maximum context length" in str(e) and SEARCHING_MODE: + raise AssertionError("The context is too long. Please try to design the agent to have shorter context.") + # try to fill in the missing field + for key in self.output_fields: + if not key in response_json and len(response_json) < len(self.output_fields): + response_json[key] = '' + for key in copy.deepcopy(list(response_json.keys())): + if len(response_json) > len(self.output_fields) and not key in self.output_fields: + del response_json[key] + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class ADAS_MATH(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + self.method_config = load_config( + Path(__file__).parent / "configs" / f"{method_config_name}.yaml" + ) + self.dataset_name = general_config['test_dataset_name'] + self.model_name_optimize = self.method_config.get('optimize_meta_model_name','gpt-4o-2024-08-06') + self.model_name_execute = self.method_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + global execute_model, api_key, base_url + execute_model = self.model_name_execute + self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0] + api_key = self.execute_model_dict['api_key'] + base_url = self.execute_model_dict['model_url'] + self.inference_model = general_config['model_name'] + + self.valid_size = self.method_config["valid_size"] + self.test_size = self.method_config["test_size"] + self.shuffle_seed = 
def call_llm(self, prompt=None, system_prompt=None, messages=None):
    """Query the meta (optimizer) model and strip Markdown code fences from its reply.

    All arguments are forwarded to the parent class's ``call_llm`` together
    with ``model_name=self.model_name_optimize``.

    Returns:
        The reply with surrounding whitespace removed and every
        ```` ```json ```` / ```` ``` ```` marker deleted.
    """
    raw_reply = super().call_llm(
        prompt=prompt,
        system_prompt=system_prompt,
        messages=messages,
        model_name=self.model_name_optimize,
    )
    without_json_fence = raw_reply.strip().replace('```json', '')
    return without_json_fence.replace('```', '')
Please only provide 1") + func = namespace[names[0]] + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + # print(f"forward function defined:\n{forward_str}") + + # set seed 0 for valid set + random.seed(self.shuffle_seed) + + examples = random.sample(val_dataset, len(val_dataset)) + + if searching_mode: + examples = examples[:self.valid_size] * self.n_repreat + else: + examples = examples[self.valid_size:self.valid_size + self.test_size] * self.n_repreat + + questions = [example['query'] for example in examples] + answers = [example['solution'] for example in examples] + + print(f"problem length: {len(examples)}") + max_workers = min(len(examples), self.max_workers) if self.multiprocessing else 1 + + task_queue = [] + for q in questions: + taskInfo = Info('task', 'User', q, -1) + task_queue.append(taskInfo) + + agentSystem = AgentSystem() + + acc_list = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + results = list(tqdm(executor.map(agentSystem.forward, task_queue), total=len(task_queue))) + + for i,result in enumerate(results): + if not isinstance(result,str): + result = result.content + if grade_answer(str(result), answers[i]): + acc_list.append(1) + else: + acc_list.append(0) + print(f"acc: {bootstrap_confidence_interval(acc_list)}") + return acc_list + + def optimizing(self, val_dataset): + # The original paper did not use the MATH dataset, here we use AFlow's MATH_val. 
+ optimized_path = Path(self.results_path) / "best_forward.txt" + if optimized_path.exists(): + print(colored("The optimal forward function already exists!\n","red")) + return + best_fitness = 0 + # training + # use validation dataset to optimize + archive = get_init_archive() + start = 0 + + for solution in archive: + if 'fitness' in solution: + continue + + solution['generation'] = "initial" + print(colored(f"============Initial Archive: {solution['name']}=================","yellow")) + # self.dynamic_forward(solution["code"]) + try: + acc_list = self.evaluate_forward_fn(solution["code"],val_dataset) + except Exception as e: + print("During evaluating initial archive:") + print(e) + continue + + fitness, fitness_str = bootstrap_confidence_interval(acc_list) + solution['fitness'] = fitness_str + # save the agent with the highest median score + if fitness > best_fitness: + best_fitness = fitness + best_solution = solution + archive_path = os.path.join(self.results_path,"archive.json") + os.makedirs(os.path.dirname(archive_path), exist_ok=True) + with open(archive_path, 'w') as json_file: + json.dump(archive, json_file, indent=4) + + for n in range(start, self.n_generation): + print(colored(f"============Generation {n + 1}=================","light_cyan")) + system_prompt, prompt = get_prompt(archive) + msg_list = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ] + + try: + next_solution = self.call_llm(messages=msg_list) + print(next_solution) + Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None) + + # Reflexion 1 + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": Reflexion_prompt_1}) + next_solution = self.call_llm(messages=msg_list) + + # Reflexion 2 + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": Reflexion_prompt_2}) + next_solution = 
self.call_llm(messages=msg_list) + next_solution = next_solution.strip().replace('```json', '').replace('```', ',') + next_solution = json.loads(next_solution) + + except Exception as e: + print(f"During optimizing:\n{e}") + n -= 1 + continue + + acc_list = [] + for _ in range(self.debug_max): + try: + acc_list = self.evaluate_forward_fn(next_solution["code"], val_dataset) + if np.mean(acc_list) < 0.01 and SEARCHING_MODE: + raise Exception("All 0 accuracy") + break + except Exception as e: + print("During evaluation:") + print(e) + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": f"Error during evaluation:\n{e}\nCarefully consider where you went wrong in your latest implementation. Using insights from previous attempts, try to debug the current code to implement the same thought. Repeat your previous thought in 'thought', and put your thinking for debugging in 'debug_thought'"}) + try: + next_solution = self.call_llm(msg_list) + except Exception as e: + print("During LLM generate new solution:") + print(e) + continue + continue + if not acc_list: + n -= 1 + continue + + fitness, fitness_str = bootstrap_confidence_interval(acc_list) + next_solution['fitness'] = fitness_str + next_solution['generation'] = n + 1 + + if 'debug_thought' in next_solution: + del next_solution['debug_thought'] + if 'reflection' in next_solution: + del next_solution['reflection'] + print(next_solution['name']) + print(next_solution['code']) + + # save the agent with the highest median score + if fitness > best_fitness: + best_fitness = fitness + best_solution = next_solution + + archive.append(next_solution) + + archive_path = os.path.join(self.results_path,"archive.json") + os.makedirs(os.path.dirname(archive_path), exist_ok=True) + with open(archive_path, 'w') as json_file: + json.dump(archive, json_file, indent=4) + + forward_path = os.path.join(self.results_path,"best_forward.txt") + with open(forward_path,"w") as f: + 
def inference(self, sample):
    """Answer a single sample with the ``forward`` function saved by ``optimizing``.

    Switches the module into inference mode (``SEARCHING_MODE = False``),
    points the module-level ``execute_model`` / ``api_key`` / ``base_url`` at
    the inference model's first configured endpoint, loads
    ``best_forward.txt`` from ``self.results_path``, installs the function it
    defines as ``AgentSystem.forward`` and runs it on the sample's query.

    Args:
        sample: Mapping that must contain a non-empty ``"query"`` entry.

    Returns:
        The value returned by ``forward`` if it is a string, otherwise its
        ``content`` attribute.

    Raises:
        ValueError: If ``sample`` has no (or an empty) ``"query"``.
        NotImplementedError: If ``best_forward.txt`` does not exist.
        AssertionError: If the file does not define exactly one callable.
    """
    query = sample.get("query")
    if not query:
        raise ValueError("Sample must contain a 'query' key.")

    global execute_model, api_key, base_url, SEARCHING_MODE
    SEARCHING_MODE = False
    execute_model = self.inference_model
    self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0]
    api_key = self.execute_model_dict['api_key']
    base_url = self.execute_model_dict['model_url']
    self.max_workers = self.model_api_config[execute_model]["max_workers"]

    optimized_path = Path(self.results_path) / "best_forward.txt"
    if not optimized_path.exists():
        raise NotImplementedError("Best_forward function does not exist!")
    with open(optimized_path, "r") as f:
        forward_str = f.read()

    # NOTE(security): this executes model-generated code read from disk with
    # full access to this module's globals. Only run it on files you trust.
    namespace = {}
    exec(forward_str, globals(), namespace)
    names = list(namespace.keys())
    if len(names) != 1:
        raise AssertionError(f"{len(names)} things in namespace. Please only provide 1")
    func = namespace[names[0]]
    if not callable(func):
        raise AssertionError(f"{func} is not callable")
    setattr(AgentSystem, "forward", func)

    taskInfo = Info('task', 'User', query, -1)
    agentSystem = AgentSystem()
    response = agentSystem.forward(taskInfo)
    if not isinstance(response, str):
        response = response.content

    # Fold the usage recorded during this call into self.token_stats. Every
    # model that was actually recorded is merged (agents record under their
    # own ``model``), and missing entries are created instead of raising
    # KeyError. The dict is emptied in place: rebinding the name here would
    # make it function-local and turn the reads above into UnboundLocalError.
    for model_key, stats in inference_execute_token_stats.items():
        totals = self.token_stats.setdefault(model_key, {})
        for counter, value in stats.items():
            totals[counter] = totals.get(counter, 0) + value
    inference_execute_token_stats.clear()
    return response
+SYSTEM_MSG = "" +optimize_execute_token_stats = {} +inference_execute_token_stats = {} +PRINT_LLM_DEBUG = False +SEARCHING_MODE = True + +client = openai.OpenAI(api_key='Your api key', base_url='Your base url') + +def get_json_response_from_gpt(msg,model,system_message,temperature=0): + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, max_tokens=4096, stop=None, response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + num_prompt_tokens = response.usage.prompt_tokens + num_completion_tokens = response.usage.completion_tokens + if isinstance(content, str) and SEARCHING_MODE: # in cases where response is None or an error message + if model not in optimize_execute_token_stats: + optimize_execute_token_stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + optimize_execute_token_stats[model]["num_llm_calls"] += 1 + optimize_execute_token_stats[model]["prompt_tokens"] += num_prompt_tokens + optimize_execute_token_stats[model]["completion_tokens"] += num_completion_tokens + elif isinstance(content, str) and not SEARCHING_MODE: + if model not in inference_execute_token_stats: + inference_execute_token_stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + inference_execute_token_stats[model]["num_llm_calls"] += 1 + inference_execute_token_stats[model]["prompt_tokens"] += num_prompt_tokens + inference_execute_token_stats[model]["completion_tokens"] += num_completion_tokens + # cost = response.usage.completion_tokens / 1000000 * 15 + response.usage.prompt_tokens / 1000000 * 5 + assert not json_dict is None + return json_dict + +class AgentSystem(): + def __init__(self) -> None: + pass + +class LLMAgentBase(): + def __init__(self, output_fields: list, agent_name: str, + role='helpful assistant', 
model='gpt-4o-mini-2024-07-18', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + + self.role = role + self.model = model + self.temperature = temperature + + # give each instance a unique id + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + # construct system prompt + output_fields_and_description = {key: f"Your {key}." for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + # construct input infos text + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx + 1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> dict: + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + try: + response_json = {} + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + assert len(response_json) == len(self.output_fields), "not returning enough fields" + except Exception as e: + # print(e) + if "maximum context length" in str(e) and SEARCHING_MODE: + raise AssertionError("The context is too long. 
Please try to design the agent to have shorter context.") + # try to fill in the missing field + for key in self.output_fields: + if not key in response_json and len(response_json) < len(self.output_fields): + response_json[key] = '' + for key in copy.deepcopy(list(response_json.keys())): + if len(response_json) > len(self.output_fields) and not key in self.output_fields: + del response_json[key] + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class ADAS_MATH(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + self.method_config = load_config( + Path(__file__).parent / "configs" / f"{method_config_name}.yaml" + ) + self.dataset_name = general_config['test_dataset_name'] + self.model_name_optimize = self.method_config.get('optimize_meta_model_name','gpt-4o-2024-08-06') + self.model_name_execute = self.method_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + global execute_model, api_key, base_url + execute_model = self.model_name_execute + self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0] + api_key = self.execute_model_dict['api_key'] + base_url = self.execute_model_dict['model_url'] + self.inference_model = general_config['model_name'] + + self.valid_size = self.method_config["valid_size"] + self.test_size = self.method_config["test_size"] + self.shuffle_seed = self.method_config["shuffle_seed"] + self.n_repreat = self.method_config["n_repreat"] + self.multiprocessing = self.method_config["multiprocessing"] + self.max_workers = self.method_config["max_workers"] + self.debug = self.method_config["debug"] + + self.results_path = 
f"results/{self.dataset_name}/adas/{self.model_name_optimize}/{self.model_name_execute}" + self.n_generation = self.method_config["n_generation"] + self.debug_max = self.method_config["debug_max"] + self.args = argparse.Namespace(**self.method_config) + + + async def call_llm(self, prompt=None, system_prompt=None, messages=None): + response = await super().call_llm(prompt=prompt, system_prompt=system_prompt, messages=messages,model_name=self.model_name_optimize) + formatted_response = response.strip().replace('```json', '').replace('```', '') + return formatted_response + + def evaluate_forward_fn(self, forward_str, val_dataset, searching_mode=True): + # dynamically define forward() + # modified from https://github.com/luchris429/DiscoPOP/blob/main/scripts/launch_evo.py + namespace = {} + exec(forward_str, globals(), namespace) + names = list(namespace.keys()) + if len(names) != 1: + raise AssertionError(f"{len(names)} things in namespace. Please only provide 1") + func = namespace[names[0]] + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + # print(f"forward function defined:\n{forward_str}") + + # set seed 0 for valid set + random.seed(self.shuffle_seed) + + examples = random.sample(val_dataset, len(val_dataset)) + + if searching_mode: + examples = examples[:self.valid_size] * self.n_repreat + else: + examples = examples[self.valid_size:self.valid_size + self.test_size] * self.n_repreat + + questions = [example['query'] for example in examples] + answers = [example['solution'] for example in examples] + + print(f"problem length: {len(examples)}") + max_workers = min(len(examples), self.max_workers) if self.multiprocessing else 1 + + task_queue = [] + for q in questions: + taskInfo = Info('task', 'User', q, -1) + task_queue.append(taskInfo) + + agentSystem = AgentSystem() + + acc_list = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + results = 
def optimizing(self, val_dataset):
    """Search for the best ``forward`` function using ``val_dataset``.

    First scores every entry of the initial archive that has no ``fitness``
    yet, then runs ``self.n_generation`` rounds in which the meta model
    proposes a new solution (see ``optimize``), the proposal is evaluated
    and, on failure, sent back to the meta model for debugging up to
    ``self.debug_max`` times. The archive is written to ``archive.json``
    after each accepted solution; the best solution's code is written to
    ``best_forward.txt`` and the execute-model token stats to
    ``api_token.json`` under ``self.results_path``.

    Returns immediately if ``best_forward.txt`` already exists.

    Args:
        val_dataset: Examples passed through to ``evaluate_forward_fn``.

    Raises:
        RuntimeError: If no evaluated solution reached a fitness above 0, so
            there is nothing to save.
    """
    # The original paper did not use the MATH dataset, here we use AFlow's MATH_val.
    optimized_path = Path(self.results_path) / "best_forward.txt"
    if optimized_path.exists():
        print(colored("The optimal forward function already exists!\n", "red"))
        return

    best_fitness = 0
    best_solution = None
    archive_path = os.path.join(self.results_path, "archive.json")

    # use validation dataset to optimize
    archive = get_init_archive()

    for solution in archive:
        if 'fitness' in solution:
            continue

        solution['generation'] = "initial"
        print(colored(f"============Initial Archive: {solution['name']}=================", "yellow"))
        try:
            acc_list = self.evaluate_forward_fn(solution["code"], val_dataset)
        except Exception as e:
            print("During evaluating initial archive:")
            print(e)
            continue

        fitness, fitness_str = bootstrap_confidence_interval(acc_list)
        solution['fitness'] = fitness_str
        # save the agent with the highest median score
        if fitness > best_fitness:
            best_fitness = fitness
            best_solution = solution
        os.makedirs(os.path.dirname(archive_path), exist_ok=True)
        with open(archive_path, 'w') as json_file:
            json.dump(archive, json_file, indent=4)

    for n in range(self.n_generation):
        print(colored(f"============Generation {n + 1}=================", "light_cyan"))
        system_prompt, prompt = get_prompt(archive)
        msg_list = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]

        try:
            # asyncio.run creates a fresh event loop and closes it afterwards.
            msg_list, next_solution = asyncio.run(self.optimize(msg_list, archive, n))
        except Exception as e:
            # A failed generation is skipped (reassigning the loop variable
            # of a ``for`` loop would not retry it).
            print(f"During optimizing:\n{e}")
            continue

        acc_list = []
        for _ in range(self.debug_max):
            try:
                acc_list = self.evaluate_forward_fn(next_solution["code"], val_dataset)
                if np.mean(acc_list) < 0.01 and SEARCHING_MODE:
                    raise Exception("All 0 accuracy")
                break
            except Exception as e:
                print("During evaluation:")
                print(e)
                msg_list.append({"role": "assistant", "content": str(next_solution)})
                msg_list.append({"role": "user", "content": f"Error during evaluation:\n{e}\nCarefully consider where you went wrong in your latest implementation. Using insights from previous attempts, try to debug the current code to implement the same thought. Repeat your previous thought in 'thought', and put your thinking for debugging in 'debug_thought'"})
                try:
                    # The conversation must go in as ``messages`` and the
                    # reply must be decoded, otherwise the next evaluation
                    # would index into a plain string.
                    next_solution = json.loads(asyncio.run(self.call_llm(messages=msg_list)))
                except Exception as llm_err:
                    print("During LLM generate new solution:")
                    print(llm_err)
                continue
        if not acc_list:
            continue

        fitness, fitness_str = bootstrap_confidence_interval(acc_list)
        next_solution['fitness'] = fitness_str
        next_solution['generation'] = n + 1

        # Drop scratch fields so they are not stored in the archive.
        next_solution.pop('debug_thought', None)
        next_solution.pop('reflection', None)
        print(next_solution['name'])
        print(next_solution['code'])

        # save the agent with the highest median score
        if fitness > best_fitness:
            best_fitness = fitness
            best_solution = next_solution

        archive.append(next_solution)

        os.makedirs(os.path.dirname(archive_path), exist_ok=True)
        with open(archive_path, 'w') as json_file:
            json.dump(archive, json_file, indent=4)

    if best_solution is None:
        raise RuntimeError("No solution reached a fitness above 0; best_forward.txt was not written.")

    forward_path = os.path.join(self.results_path, "best_forward.txt")
    with open(forward_path, "w") as f:
        f.write(best_solution['code'])
    print(colored("Optimization complete!", "green"))
    print(colored(f"\n>> Optimization token stats: {self.get_token_stats()}", "light_yellow"))
    token_path = os.path.join(self.results_path, "api_token.json")
    with open(token_path, "a") as f:
        json.dump(optimize_execute_token_stats, f, indent=4)


async def optimize(self, msg_list, archive, n):
    """Ask the meta model for a new solution and refine it with two Reflexion rounds.

    Args:
        msg_list: Conversation so far; the proposals and Reflexion prompts
            are appended to it in place.
        archive: Current archive; its last entry is passed to
            ``get_reflexion_prompt`` when ``n > 0``.
        n: Zero-based generation index.

    Returns:
        ``(msg_list, next_solution)`` where ``next_solution`` is the JSON
        object decoded from the final reply.

    Raises:
        Exception: Whatever ``call_llm`` or ``json.loads`` raises.
    """
    next_solution = await self.call_llm(messages=msg_list)
    print(next_solution)
    Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None)

    # Two refinement rounds. These must be awaited: asyncio.run() cannot be
    # called from inside a coroutine that is already running on a loop.
    for reflexion_prompt in (Reflexion_prompt_1, Reflexion_prompt_2):
        msg_list.append({"role": "assistant", "content": str(next_solution)})
        msg_list.append({"role": "user", "content": reflexion_prompt})
        next_solution = await self.call_llm(messages=msg_list)

    # call_llm has already stripped the code fences from the reply.
    return msg_list, json.loads(next_solution)
Please only provide 1") + func = namespace[names[0]] + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + taskInfo = Info('task', 'User', query, -1) + agentSystem = AgentSystem() + response = agentSystem.forward(taskInfo) + if not isinstance(response,str): + response = response.content + + self.token_stats[self.model_name]["num_llm_calls"] = inference_execute_token_stats[self.model_name]["num_llm_calls"] + self.token_stats[self.model_name]["prompt_tokens"] = inference_execute_token_stats[self.model_name]["prompt_tokens"] + self.token_stats[self.model_name]["completion_tokens"] = inference_execute_token_stats[self.model_name]["completion_tokens"] + return response + + + + + diff --git a/methods/ADAS/adas_mgsm.py b/methods/ADAS/adas_mgsm.py new file mode 100644 index 0000000..b6572ac --- /dev/null +++ b/methods/ADAS/adas_mgsm.py @@ -0,0 +1,399 @@ +import argparse +import copy +import json +import numpy as np +import openai +import os +import random + +from collections import namedtuple +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from tqdm import tqdm + +from ..mas_base import MAS +from ..utils import load_config +from .adas_utils import random_id, bootstrap_confidence_interval, score_mgsm +from .prompt.mgsm_prompt import get_init_archive, get_prompt, get_reflexion_prompt + + + +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +FORMAT_INST = lambda request_keys: f"""Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY REQUEST FIELDS and ensure that your response is a well-formed JSON object!\n""" +ROLE_DESC = lambda role: f"You are a {role}." 
def merge_token_stats(target, *sources):
    """Add the per-model counters of every dict in ``sources`` into ``target``.

    Each source maps a model key to a dict of counter name -> number.
    Counters missing from ``target`` start at 0. ``target`` is modified in
    place and also returned.
    """
    for source in sources:
        for model_name, counters in source.items():
            for counter_name, amount in counters.items():
                # The model entry is created only once a counter exists for it.
                bucket = target.setdefault(model_name, {})
                bucket.setdefault(counter_name, 0)
                bucket[counter_name] = bucket[counter_name] + amount
    return target
isinstance(content, str) and not SEARCHING_MODE: + if model not in inference_execute_token_stats: + inference_execute_token_stats[model] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + inference_execute_token_stats[model]["num_llm_calls"] += 1 + inference_execute_token_stats[model]["prompt_tokens"] += num_prompt_tokens + inference_execute_token_stats[model]["completion_tokens"] += num_completion_tokens + # cost = response.usage.completion_tokens / 1000000 * 15 + response.usage.prompt_tokens / 1000000 * 5 + assert not json_dict is None + return json_dict + + def generate_prompt(self, input_infos, instruction) -> str: + # construct system prompt + output_fields_and_description = {key: f"Your {key}. Return ONLY the number, i.e. 121 or 9." for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + # construct input infos text + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx + 1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> dict: + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + try: + response_json = {} + response_json = self.get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + assert len(response_json) == len(self.output_fields), "not returning enough fields" + except Exception as e: + # print(e) + if "maximum context length" in str(e) and SEARCHING_MODE: + raise AssertionError("The 
context is too long. Please try to design the agent to have shorter context.") + # try to fill in the missing field + for key in self.output_fields: + if not key in response_json and len(response_json) < len(self.output_fields): + response_json[key] = '' + for key in copy.deepcopy(list(response_json.keys())): + if len(response_json) > len(self.output_fields) and not key in self.output_fields: + del response_json[key] + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class ADAS_MGSM(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + # set the meta model and execute model for optimizing mode + + self.model_api_config = general_config["model_api_config"] + self.optimize_execute_model = general_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + self.optimize_meta_model = general_config.get('optimize_meta_model_name','gpt-4o') + global execute_model, api_key, base_url + execute_model = self.optimize_execute_model + self.execute_model_dict = self.model_api_config[execute_model]['model_list'][0] + api_key = self.execute_model_dict['api_key'] + base_url = self.execute_model_dict['model_url'] + self.max_workers = self.model_api_config[execute_model]["max_workers"] + self.inference_model = general_config['model_name'] + + self.config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml")) + self.domain = method_config_name + self.valid_size = self.config["valid_size"] + self.test_size = self.config["test_size"] + self.shuffle_seed = self.config["shuffle_seed"] + self.n_repreat = self.config["n_repreat"] + 
def call_llm(self, prompt=None, system_prompt=None, messages=None):
    """Query the meta model through the base class and strip Markdown fences.

    The request is forwarded to the parent ``call_llm`` with
    ``model_name=self.optimize_meta_model``. The reply is stripped of
    surrounding whitespace and every ```` ```json ```` / ```` ``` ```` marker
    is removed before it is returned.
    """
    raw_reply = super().call_llm(
        prompt=prompt,
        system_prompt=system_prompt,
        messages=messages,
        model_name=self.optimize_meta_model,
    )
    cleaned = raw_reply.strip()
    # Order matters: remove the language-tagged fence before the bare one.
    for fence in ('```json', '```'):
        cleaned = cleaned.replace(fence, '')
    return cleaned
def optimizing(self, val_dataset):
    """Run the agent search loop and persist the archive and best workflow.

    The initial archive is scored first, then ``self.n_generation`` new
    candidates are requested from the meta model (with two reflexion rounds
    each) and evaluated on ``val_dataset``. A candidate that fails, or scores
    (almost) zero, is sent back to the meta model for up to
    ``self.debug_max`` evaluation attempts. The archive is rewritten to
    ``self.optimizing_path`` after every accepted solution, and the solution
    with the highest bootstrap median is written to ``self.inference_path``.

    Args:
        val_dataset: Examples forwarded unchanged to ``evaluate_forward_fn``.

    Raises:
        RuntimeError: If no evaluated solution scored above zero, so there is
            no best workflow to save.
    """
    # create save dir
    os.makedirs(os.path.dirname(self.optimizing_path), exist_ok=True)
    os.makedirs(os.path.dirname(self.inference_path), exist_ok=True)

    best_fitness = 0
    # Initialised up front so the final dump cannot hit an unbound name when
    # nothing ever beats ``best_fitness``.
    best_solution = None
    # training
    # use validation dataset to optimize
    archive = get_init_archive()

    for solution in archive:
        if 'fitness' in solution:
            continue

        solution['generation'] = "initial"
        print(f"============Initial Archive: {solution['name']}=================")
        try:
            acc_list = self.evaluate_forward_fn(solution["code"], val_dataset)
        except Exception as e:
            print("During evaluating initial archive:")
            print(e)
            continue

        fitness, fitness_str = bootstrap_confidence_interval(acc_list)
        solution['fitness'] = fitness_str
        # save the agent with the highest median score
        if fitness > best_fitness:
            best_fitness = fitness
            best_solution = solution

        with open(self.optimizing_path, 'w') as json_file:
            json.dump(archive, json_file, indent=4)

    for n in range(self.n_generation):
        print(f"============Generation {n + 1}=================")
        system_prompt, prompt = get_prompt(archive)
        msg_list = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]

        try:
            next_solution = self.call_llm(messages=msg_list)
            print(next_solution)
            Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None)
            # Reflexion 1
            msg_list.append({"role": "assistant", "content": str(next_solution)})
            msg_list.append({"role": "user", "content": Reflexion_prompt_1})
            next_solution = self.call_llm(messages=msg_list)

            # Reflexion 2
            msg_list.append({"role": "assistant", "content": str(next_solution)})
            msg_list.append({"role": "user", "content": Reflexion_prompt_2})
            next_solution = self.call_llm(messages=msg_list)
            # Strip any leftover Markdown fences. They must be removed, not
            # replaced by other characters, or the payload stops being JSON.
            next_solution = next_solution.strip().replace('```json', '').replace('```', '')
            next_solution = json.loads(next_solution)
        except Exception as e:
            # A generation that cannot be produced is skipped. A ``for`` loop
            # over ``range`` cannot be rewound, so there is no retry here.
            print("During optimizing:")
            print(e)
            continue

        acc_list = []
        for _ in range(self.debug_max):
            try:
                acc_list = self.evaluate_forward_fn(next_solution["code"], val_dataset)
                if np.mean(acc_list) < 0.01 and SEARCHING_MODE:
                    raise Exception("All 0 accuracy")
                break
            except Exception as e:
                print("During evaluation:")
                print(e)
                # Drop results of the failed attempt so they can never be
                # recorded against a later revision of the code.
                acc_list = []
                msg_list.append({"role": "assistant", "content": str(next_solution)})
                msg_list.append({"role": "user", "content": f"Error during evaluation:\n{e}\nCarefully consider where you went wrong in your latest implementation. Using insights from previous attempts, try to debug the current code to implement the same thought. Repeat your previous thought in 'thought', and put your thinking for debugging in 'debug_thought'"})
                try:
                    # The conversation must go in as ``messages`` and the
                    # reply must be parsed, otherwise the next attempt would
                    # index a plain string with ``"code"``.
                    next_solution = json.loads(self.call_llm(messages=msg_list))
                except Exception as llm_error:
                    print("During LLM generate new solution:")
                    print(llm_error)
        if not acc_list:
            # Every attempt failed: skip this generation.
            continue

        fitness, fitness_str = bootstrap_confidence_interval(acc_list)
        next_solution['fitness'] = fitness_str
        next_solution['generation'] = n + 1

        # Debug-only fields are not kept in the archive.
        next_solution.pop('debug_thought', None)
        next_solution.pop('reflection', None)
        print(next_solution['name'])
        print(next_solution['code'])

        # save the agent with the highest median score
        if fitness > best_fitness:
            best_fitness = fitness
            best_solution = next_solution

        archive.append(next_solution)

        with open(self.optimizing_path, 'w') as json_file:
            json.dump(archive, json_file, indent=4)

    if best_solution is None:
        raise RuntimeError(
            "No evaluated solution scored above 0; there is no best workflow to save."
        )

    with open(self.inference_path, 'w') as json_file:
        json.dump(best_solution, json_file, indent=4)
os.path.exists(self.inference_path): + raise NotImplementedError("The specified best workflow path does not exist.") + with open(self.inference_path, 'r') as json_file: + best_solution = json.load(json_file) + + namespace = {} + exec(best_solution["code"], globals(), namespace) + names = list(namespace.keys()) + if len(names) != 1: + raise AssertionError(f"{len(names)} things in namespace. Please only provide 1") + func = namespace[names[0]] + + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + agentSystem = AgentSystem() + + taskInfo = Info('task', 'User', query, -1) + response = agentSystem.forward(taskInfo) + if isinstance(response, str): + response = response + else: + response = response.content + + merge_token_stats(self.token_stats, optimize_execute_token_stats, inference_execute_token_stats) + + return response \ No newline at end of file diff --git a/methods/ADAS/adas_mmlu.py b/methods/ADAS/adas_mmlu.py new file mode 100644 index 0000000..d221ece --- /dev/null +++ b/methods/ADAS/adas_mmlu.py @@ -0,0 +1,423 @@ +import os +from ..mas_base import MAS +from ..utils import load_config +from .adas_utils import format_multichoice_question, random_id, bootstrap_confidence_interval + +import copy +import json +import os +import random +from pathlib import Path +from collections import namedtuple +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import openai +import pandas +import argparse +from tqdm import tqdm +from .prompt.mmlu_prompt import get_init_archive, get_prompt, get_reflexion_prompt + +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +FORMAT_INST = lambda request_keys: f"""Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY REQUEST FIELDS and ensure that your response is a well-formed JSON object!\n""" +ROLE_DESC = lambda role: f"You are a {role}." 
def merge_token_stats(target, *sources):
    """Add the per-model counters of every mapping in ``sources`` into ``target``.

    Each source maps a model key to a ``{counter_name: number}`` dict.
    Counters missing from ``target`` start at 0. ``target`` is updated in
    place and also returned. A model whose counter dict is empty creates no
    entry in ``target``.
    """
    for source in sources:
        for model_key, stats in source.items():
            for counter_name, amount in stats.items():
                per_model = target.setdefault(model_key, {})
                per_model[counter_name] = per_model.get(counter_name, 0) + amount
    return target
def query(self, input_infos: list, instruction, iteration_idx=-1) -> list:
    """Ask the model for the agent's output fields and wrap them as ``Info``.

    Args:
        input_infos: Inputs forwarded to ``generate_prompt``.
        instruction: Instruction text forwarded to ``generate_prompt``.
        iteration_idx: Stored in every returned ``Info``.

    Returns:
        A list with one ``Info(name, author, content, iteration_idx)`` per
        key of the (possibly repaired) JSON reply.

    Raises:
        AssertionError: If the request failed with a "maximum context
            length" error while ``SEARCHING_MODE`` is on.
    """
    system_prompt, prompt = self.generate_prompt(input_infos, instruction)
    response_json = {}
    try:
        response_json = self.get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature)
    except Exception as e:
        if "maximum context length" in str(e) and SEARCHING_MODE:
            raise AssertionError("The context is too long. Please try to design the agent to have shorter context.") from e
        # Any other failure is deliberately best-effort: continue with an
        # empty reply so every expected field is filled with ''.

    if not isinstance(response_json, dict):
        # Valid JSON that is not an object (list, string, number) carries no
        # named fields, so treat it like an empty reply.
        response_json = {}

    # Repair a reply with the wrong number of fields. This is an explicit
    # check rather than an ``assert`` so it still runs under ``python -O``.
    # Both loops are no-ops when the count already matches.
    # try to fill in the missing field
    for key in self.output_fields:
        if key not in response_json and len(response_json) < len(self.output_fields):
            response_json[key] = ''
    for key in list(response_json.keys()):
        if len(response_json) > len(self.output_fields) and key not in self.output_fields:
            del response_json[key]

    author = self.__repr__()
    return [Info(key, author, value, iteration_idx) for key, value in response_json.items()]
self.config["test_size"] + self.shuffle_seed = self.config["shuffle_seed"] + self.n_repreat = self.config["n_repreat"] + self.multiprocessing = self.config["multiprocessing"] + self.debug = self.config["debug"] + + self.save_dir = Path(__file__).parents[2] / "results" / self.domain / f"adas_{self.optimize_meta_model}_optimize_{self.optimize_execute_model}_execute" + self.optimizing_path = self.save_dir / "archive.json" + self.inference_path = self.save_dir / "best_workflow.json" + self.n_generation = self.config["n_generation"] + self.debug_max = self.config["debug_max"] + self.args = argparse.Namespace(**self.config) + + + def call_llm(self, prompt=None, system_prompt=None, messages=None): + response = super().call_llm(prompt=prompt, system_prompt=system_prompt, messages=messages, model_name=self.optimize_meta_model) + formatted_response = response.strip().replace('```json', '').replace('```', '') + return formatted_response + + def evaluate_forward_fn(self, forward_str, val_dataset, searching_mode=True): + # dynamically define forward() + # modified from https://github.com/luchris429/DiscoPOP/blob/main/scripts/launch_evo.py + namespace = {} + exec(forward_str, globals(), namespace) + names = list(namespace.keys()) + if len(names) != 1: + raise AssertionError(f"{len(names)} things in namespace. 
Please only provide 1") + func = namespace[names[0]] + if not callable(func): + raise AssertionError(f"{func} is not callable") + setattr(AgentSystem, "forward", func) + # print(f"forward function defined:\n{forward_str}") + + LETTER_TO_INDEX = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + # set seed 0 for valid set + random.seed(self.shuffle_seed) + examples = random.sample(val_dataset, len(val_dataset)) + random.shuffle(examples) + + if searching_mode: + examples = examples[:self.valid_size] * self.n_repreat + else: + examples = examples[self.valid_size:self.valid_size + self.test_size] * self.n_repreat + + questions = [format_multichoice_question(example) for example in examples] + answers = [example['answer'] for example in examples] + + print(f"problem length: {len(examples)}") + max_workers = min(len(examples), self.max_workers) if self.multiprocessing else 1 + + task_queue = [] + for q in questions: + taskInfo = Info('task', 'User', q, -1) + task_queue.append(taskInfo) + + agentSystem = AgentSystem() + + acc_list = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + results = list(tqdm(executor.map(agentSystem.forward, task_queue), total=len(task_queue))) + + for q_idx, res in enumerate(results): + try: + if isinstance(res, str) and res in LETTER_TO_INDEX: + predicted_idx = LETTER_TO_INDEX[res] + elif 'A)' in res: + predicted_idx = 0 + elif 'B)' in res: + predicted_idx = 1 + elif 'C)' in res: + predicted_idx = 2 + elif 'D)' in res: + predicted_idx = 3 + elif isinstance(res, list): + try_res = res[1] + predicted_idx = LETTER_TO_INDEX[try_res.content] + elif res.content in LETTER_TO_INDEX: + predicted_idx = LETTER_TO_INDEX[res.content] + elif 'A)' in res.content: + predicted_idx = 0 + elif 'B)' in res.content: + predicted_idx = 1 + elif 'C)' in res.content: + predicted_idx = 2 + elif 'D)' in res.content: + predicted_idx = 3 + else: + print(f"error in q {q_idx}") + acc_list.append(0) + continue + except Exception as e: + acc_list.append(0) + continue + + 
if predicted_idx == answers[q_idx]: + acc_list.append(1) + else: + acc_list.append(0) + print(f"acc: {bootstrap_confidence_interval(acc_list)}") + return acc_list + + def optimizing(self, val_dataset): + # create save dir + os.makedirs(os.path.dirname(self.optimizing_path), exist_ok=True) + os.makedirs(os.path.dirname(self.inference_path), exist_ok=True) + + best_fitness = 0 + # training + # use validation dataset to optimize + archive = get_init_archive() + start = 0 + + # for solution in archive: + # if 'fitness' in solution: + # continue + + # solution['generation'] = "initial" + # print(f"============Initial Archive: {solution['name']}=================") + # # self.dynamic_forward(solution["code"]) + # try: + # acc_list = self.evaluate_forward_fn(solution["code"],data_path=val_dataset) + # except Exception as e: + # print("During evaluating initial archive:") + # print(e) + # continue + + # fitness, fitness_str = bootstrap_confidence_interval(acc_list) + # solution['fitness'] = fitness_str + # # save the agent with the highest median score + # if fitness > best_fitness: + # best_fitness = fitness + # best_solution = solution + + # with open(self.optimizing_path, 'w') as json_file: + # json.dump(archive, json_file, indent=4) + + for n in range(start, self.n_generation): + print(f"============Generation {n + 1}=================") + system_prompt, prompt = get_prompt(archive) + msg_list = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt} + ] + + try: + next_solution = self.call_llm(messages=msg_list) + print(next_solution) + Reflexion_prompt_1, Reflexion_prompt_2 = get_reflexion_prompt(archive[-1] if n > 0 else None) + # Reflexion 1 + msg_list.append({"role": "assistant", "content": str(next_solution)}) + msg_list.append({"role": "user", "content": Reflexion_prompt_1}) + next_solution = self.call_llm(messages=msg_list) + + # Reflexion 2 + msg_list.append({"role": "assistant", "content": str(next_solution)}) + 
def inference(self, sample):
    """Answer one sample with the saved best workflow.

    Switches the module-level execution settings (model, API key, base URL)
    to ``self.inference_model`` and turns ``SEARCHING_MODE`` off. It then
    loads the workflow stored at ``self.inference_path``, installs its single
    function as ``AgentSystem.forward`` and runs it on ``sample["query"]``.

    Args:
        sample: Mapping with a non-empty ``"query"`` entry.

    Returns:
        The workflow result itself when it is a ``str``, otherwise its
        ``content`` attribute.

    Raises:
        ValueError: If ``sample`` has no truthy ``"query"``.
        NotImplementedError: If ``self.inference_path`` does not exist.
        AssertionError: If the stored code does not define exactly one
            callable.
    """
    global execute_model, api_key, base_url, SEARCHING_MODE

    query = sample.get("query")
    if not query:
        raise ValueError("Sample must contain a 'query' key.")

    # Point the shared, module-level agent settings at the inference model.
    SEARCHING_MODE = False
    execute_model = self.inference_model
    model_entry = self.model_api_config[execute_model]
    self.execute_model_dict = model_entry['model_list'][0]
    api_key = self.execute_model_dict['api_key']
    base_url = self.execute_model_dict['model_url']
    self.max_workers = model_entry["max_workers"]

    if not os.path.exists(self.inference_path):
        raise NotImplementedError("The specified best workflow path does not exist.")
    with open(self.inference_path, 'r') as json_file:
        best_solution = json.load(json_file)

    # NOTE(review): this executes code read from the workflow file; the file
    # must come from a trusted optimisation run.
    namespace = {}
    exec(best_solution["code"], globals(), namespace)
    if len(namespace) != 1:
        raise AssertionError(f"{len(namespace)} things in namespace. Please only provide 1")
    (func,) = namespace.values()
    if not callable(func):
        raise AssertionError(f"{func} is not callable")

    AgentSystem.forward = func
    response = AgentSystem().forward(Info('task', 'User', query, -1))
    if not isinstance(response, str):
        response = response.content

    # NOTE(review): the two module-level stat dicts are never reset in this
    # method, so each call merges their running totals again; confirm that
    # ``self.token_stats`` is meant to receive cumulative values.
    merge_token_stats(self.token_stats, optimize_execute_token_stats, inference_execute_token_stats)

    return response
def bootstrap_confidence_interval(data, num_bootstrap_samples=100000, confidence_level=0.95):
    """
    Calculate the bootstrap confidence interval for the mean of 1D accuracy data.
    Also returns the median of the bootstrap means.

    Args:
    - data (list or array of float): 1D list or array of data points.
    - num_bootstrap_samples (int): Number of bootstrap samples.
    - confidence_level (float): The desired confidence level (e.g., 0.95 for 95%).

    Returns:
    - tuple: ``(median_percent, summary)``. ``median_percent`` is the median of
      the bootstrap means as a percentage. ``summary`` is a formatted string with
      the confidence interval and the median as percentages with one decimal
      place, labelled with the requested confidence level.
    """
    # Convert data to a numpy array for easier manipulation
    data = np.array(data)
    sample_size = len(data)

    # Resample with replacement and record the mean of every bootstrap sample
    bootstrap_means = np.array([
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(num_bootstrap_samples)
    ])

    # Compute the lower and upper percentiles for the confidence interval
    lower_percentile = (1.0 - confidence_level) / 2.0
    upper_percentile = 1.0 - lower_percentile
    ci_lower = np.percentile(bootstrap_means, lower_percentile * 100)
    ci_upper = np.percentile(bootstrap_means, upper_percentile * 100)

    # Compute the median of the bootstrap means
    median = np.median(bootstrap_means)

    # Convert to percentages
    ci_lower_percent = ci_lower * 100
    ci_upper_percent = ci_upper * 100
    median_percent = median * 100

    # The label follows ``confidence_level`` instead of always claiming 95%.
    # ``:g`` prints 0.95 as "95" and 0.995 as "99.5".
    level_label = f"{confidence_level * 100:g}%"

    return median_percent, (
        f"{level_label} Bootstrap Confidence Interval: "
        f"({ci_lower_percent:.1f}%, {ci_upper_percent:.1f}%), Median: {median_percent:.1f}%"
    )
in targets: + raise ValueError(f"targets {targets} contains a decimal point.") + # targets = int(targets.replace(",", "")) + examples.append({"inputs": LANG_TO_INSTRUCTIONS[lang].format(input=inputs), "targets": targets, "lang": lang}) + return examples + + +def get_all_examples() -> list[dict[str, str]]: + examples = [] + for lang in ALL_LANGUAGES: + # if lang != "en": + # continue + examples += get_lang_examples(lang) + return examples + +# required by gpqa +def load_questions(path: str, seed: int) -> List[Example]: + """Load questions from csv file and return a list of Example namedtuples.""" + question_df = pd.read_csv(path) + random.seed(seed) + + def shuffle_choices_and_create_example(row) -> Example: + list_choices = [row['Incorrect Answer 1'], row['Incorrect Answer 2'], row['Incorrect Answer 3'], row['Correct Answer']] + random.shuffle(list_choices) + example = Example(row.Question, + list_choices[0], + list_choices[1], + list_choices[2], + list_choices[3], + list_choices.index(row['Correct Answer']) + ) + return example + + return [shuffle_choices_and_create_example(row) for _, row in question_df.iterrows()] + +# required by drop +def _remove_articles(text: str) -> str: + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + +def _white_space_fix(text: str) -> str: + return " ".join(text.split()) + + +EXCLUDE = set(string.punctuation) + + +def _remove_punc(text: str) -> str: + if not _is_number(text): + return "".join(ch for ch in text if ch not in EXCLUDE) + else: + return text + + +def _lower(text: str) -> str: + return text.lower() + + +def _tokenize(text: str) -> List[str]: + return re.split(" |-", text) + + +def _normalize_answer(text: str) -> str: + """Lower text and remove punctuation, articles and extra whitespace.""" + + parts = [ + _white_space_fix(_remove_articles(_normalize_number(_remove_punc(_lower(token))))) + for token in _tokenize(text) + ] + parts = [part for part in parts if part.strip()] + normalized = 
" ".join(parts).strip() + return normalized + + +def _is_number(text: str) -> bool: + try: + float(text) + return True + except ValueError: + return False + + +def _normalize_number(text: str) -> str: + if _is_number(text): + return str(float(text)) + else: + return text + + +def _answer_to_bags( + answer: Union[str, List[str], Tuple[str, ...]] +) -> Tuple[List[str], List[Set[str]]]: + if isinstance(answer, (list, tuple)): + raw_spans = answer + else: + raw_spans = [answer] + normalized_spans: List[str] = [] + token_bags = [] + for raw_span in raw_spans: + normalized_span = _normalize_answer(raw_span) + normalized_spans.append(normalized_span) + token_bags.append(set(normalized_span.split())) + return normalized_spans, token_bags + + +def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: + """ + Takes gold and predicted answer sets and first finds the optimal 1-1 alignment + between them and gets maximum metric values over all the answers. + """ + scores = np.zeros([len(gold), len(predicted)]) + for gold_index, gold_item in enumerate(gold): + for pred_index, pred_item in enumerate(predicted): + if _match_numbers_if_present(gold_item, pred_item): + scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) + row_ind, col_ind = linear_sum_assignment(-scores) + + max_scores = np.zeros([max(len(gold), len(predicted))]) + for row, column in zip(row_ind, col_ind): + max_scores[row] = max(max_scores[row], scores[row, column]) + return max_scores + + +def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: + intersection = len(gold_bag.intersection(predicted_bag)) + if not predicted_bag: + precision = 1.0 + else: + precision = intersection / float(len(predicted_bag)) + if not gold_bag: + recall = 1.0 + else: + recall = intersection / float(len(gold_bag)) + f1 = ( + (2 * precision * recall) / (precision + recall) + if not (precision == 0.0 and recall == 0.0) + else 0.0 + ) * 100 + return f1 + + +def 
def get_drop_metrics(
    predicted: Union[str, List[str], Tuple[str, ...]], gold: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[float, float]:
    """
    Score a single prediction against a single gold answer with the DROP metrics.

    Each argument may be one string or a list/tuple of strings. Returns
    ``(exact_match, f1)``: ``exact_match`` is 1.0 when the normalized spans of
    both sides form the same set and have the same count, otherwise 0.0;
    ``f1`` is the mean of the aligned per-bag F1 scores, rounded to two decimals.
    """
    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    same_span_set = set(predicted_spans) == set(gold_spans)
    same_span_count = len(predicted_spans) == len(gold_spans)
    exact_match = 1.0 if (same_span_set and same_span_count) else 0.0

    per_bag_f1 = _align_bags(predicted_token_bags, gold_token_bags)
    f1 = round(np.mean(per_bag_f1), 2)
    return exact_match, f1
def drop_metric(sample: str, reference: list[str]) -> Tuple[float, float]:
    """Return the best ``(exact_match, f1)`` of ``sample`` over all references.

    Args:
        sample: The predicted answer text.
        reference: Candidate gold answers; blank (whitespace-only) entries
            are ignored.

    Returns:
        A tuple ``(em, f1)``. Each component is maximized independently, so
        the two values may come from different references. When there is no
        non-blank reference to compare against, ``(0.0, 0.0)`` is returned
        instead of raising ``ValueError`` from ``max()`` on an empty list.
    """
    scores = [
        get_drop_metrics(sample, answer)
        for answer in reference
        if answer.strip() != ""
    ]
    best_em = max((em for em, _ in scores), default=0.0)
    best_f1 = max((f1 for _, f1 in scores), default=0.0)
    return (best_em, best_f1)
The racial makeup of the village was 96.11% White (U.S. Census), 0.95% African American (U.S. Census) or Race (United States Census), 0.11% Native American (U.S. Census), 0.11% Asian (U.S. Census), 0.21% from Race (United States Census), and 2.52% from two or more races. 1.05% of the population were Hispanics in the United States or Latino (U.S. Census) of any race.\nQuestion: How many more people, in terms of percentage, were from two or more races compared to being solely Native American or solely Asian?\nAnswer: 2.3 + +# Your Task +--- + +""" + examples = [] + for sample in test_samples: + sample['inputs'] = few_shot_prompt + sample['context'] + sample['targets'] = sample["ref_text"].split("|") + examples.append(sample) + return examples + diff --git a/methods/ADAS/configs/config.yaml b/methods/ADAS/configs/config.yaml new file mode 100644 index 0000000..fbbd114 --- /dev/null +++ b/methods/ADAS/configs/config.yaml @@ -0,0 +1,10 @@ +valid_size: 128 +test_size: 800 +shuffle_seed: 0 +n_repreat: 1 +multiprocessing: true +max_workers: 48 +debug: true +expr_name: "math_gpt3.5_results" +n_generation: 3 +debug_max: 3 \ No newline at end of file diff --git a/methods/ADAS/configs/drop.yaml b/methods/ADAS/configs/drop.yaml new file mode 100644 index 0000000..4163441 --- /dev/null +++ b/methods/ADAS/configs/drop.yaml @@ -0,0 +1,10 @@ +valid_size: 128 +test_size: 800 +shuffle_seed: 0 +n_repreat: 1 +multiprocessing: true +max_workers: 48 +debug: true +expr_name: "mmlu_gpt3.5_results" +n_generation: 10 +debug_max: 3 \ No newline at end of file diff --git a/methods/ADAS/configs/gpqa.yaml b/methods/ADAS/configs/gpqa.yaml new file mode 100644 index 0000000..4163441 --- /dev/null +++ b/methods/ADAS/configs/gpqa.yaml @@ -0,0 +1,10 @@ +valid_size: 128 +test_size: 800 +shuffle_seed: 0 +n_repreat: 1 +multiprocessing: true +max_workers: 48 +debug: true +expr_name: "mmlu_gpt3.5_results" +n_generation: 10 +debug_max: 3 \ No newline at end of file diff --git 
a/methods/ADAS/configs/mgsm.yaml b/methods/ADAS/configs/mgsm.yaml new file mode 100644 index 0000000..4163441 --- /dev/null +++ b/methods/ADAS/configs/mgsm.yaml @@ -0,0 +1,10 @@ +valid_size: 128 +test_size: 800 +shuffle_seed: 0 +n_repreat: 1 +multiprocessing: true +max_workers: 48 +debug: true +expr_name: "mmlu_gpt3.5_results" +n_generation: 10 +debug_max: 3 \ No newline at end of file diff --git a/methods/ADAS/configs/mmlu.yaml b/methods/ADAS/configs/mmlu.yaml new file mode 100644 index 0000000..82a74a0 --- /dev/null +++ b/methods/ADAS/configs/mmlu.yaml @@ -0,0 +1,8 @@ +valid_size: 128 +test_size: 800 +shuffle_seed: 0 +n_repreat: 1 +multiprocessing: true +debug: true +n_generation: 1 +debug_max: 3 \ No newline at end of file diff --git a/methods/ADAS/evaluate_math_aflow.py b/methods/ADAS/evaluate_math_aflow.py new file mode 100644 index 0000000..85de810 --- /dev/null +++ b/methods/ADAS/evaluate_math_aflow.py @@ -0,0 +1,97 @@ +import regex,re +from termcolor import colored +from typing import Any +from math import isclose +from sympy import N, simplify +from sympy.parsing.latex import parse_latex +from sympy.parsing.sympy_parser import parse_expr + +def extract_model_answer(text: str) -> str: + pattern = r"\\boxed{((?:[^{}]|{[^{}]*})*)}" + boxed_matches = re.findall(pattern, text, re.DOTALL) + if boxed_matches: + return boxed_matches[-1].strip() + + sentence_end_pattern = r"(? 
def parse_digits(num):
    """Parse ``num`` as a float, accepting thousands separators and percentages.

    Commas are removed before parsing. If the text is not a plain number but
    ends with ``%`` (optionally LaTeX-escaped as ``\\%``), the percent sign and
    the escaping backslash are stripped and the value is divided by 100.

    Returns:
        The parsed float, or ``None`` when the text is not a number. Text
        without a percent sign is never divided by 100.
    """
    text = str(num).replace(",", "")
    try:
        return float(text)
    except ValueError:
        pass

    # Only a value that actually carried a percent sign is a percentage.
    if not text.endswith("%"):
        return None
    text = text[:-1]
    # LaTeX input such as 50\% leaves a trailing backslash once "%" is removed.
    if text.endswith("\\"):
        text = text[:-1]
    try:
        return float(text) / 100
    except ValueError:
        return None
reasoning enables complex problem-solving through intermediate steps. This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." 
+ N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." + final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." 
+ critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." + + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Reading Comprehension Specialist', 'Logical Reasoning Strategist', 'Multidisciplinary Knowledge Integrator']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." 
+ + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Reading Comprehension Specialist', 'Logical Reasoning Strategist', 'Multidisciplinary Knowledge Integrator', 'Helpful Assistant']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Reading Comprehension Specialist, Logical Reasoning Strategist, and Multidisciplinary Knowledge Integrator." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'specialist' in choice.content.lower(): + expert_id = 0 + elif 'strategist' in choice.content.lower(): + expert_id = 1 + elif 'integrator' in choice.content.lower(): + expert_id = 2 + else: + expert_id = 3 # Default to helpful assistant + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs (DROP), which assesses the ability to perform discrete reasoning and comprehend detailed information across multiple paragraphs. + +## An example question from DROP: + +You will be asked to read a passage and answer a question. +Passage: +Non-nationals make up more than half of the population of Bahrain, with immigrants making up about 55% of the overall population. Of those, the vast majority come from South and Southeast Asia: according to various media reports and government statistics dated between 2005-2009 roughly 290,000 Indians, 125,000 Bangladeshis, 45,000 Pakistanis, 45,000 Filipinos, and 8,000 Indonesians.\nQuestion: What two nationalities had the same number of people living in Bahrain between 2005-2009? 
+Answer [Not Given]: +Pakistanis and Filipinos + + +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." + +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. 
+ \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." 
for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. + \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. 
+ \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. 
+ +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. 
This is WRONG: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +response_infos = reasoning_agent([taskInfo] + ..., reasoning_instruction) + +# Extract the final answer from the response_infos +for info in response_infos: + if info.name == 'final_answer': + return info +# Fallback if no answer is found +return Info('answer', 'Final Decision Agent', 'No answer generated.', 0) +``` +You should not extract the final answer by yourself. You SHOULD directly return the answer Info. Also, you should always return the best answer you can get. +CORRECT example: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +thinking, answer = reasoning_agent([taskInfo] + ..., reasoning_instruction) +return answer +``` + +# Your task +You are deeply familiar with prompting techniques and the agent works from the literature. Your goal is to maximize the specified performance metrics by proposing interestingly new agents. +Observe the discovered agents carefully and think about what insights, lessons, or stepping stones can be learned from them. +Be creative when thinking about the next interesting agent to try. You are encouraged to draw inspiration from related agent papers or academic papers from other research areas. +Use the knowledge from the archive and inspiration from academic literature to propose the next interesting agentic system design. +THINK OUTSIDE THE BOX. +""" + +Reflexion_prompt_1 = f""""[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points: + +1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings. +- Make sure to check the difference between the proposed architecture and previous attempts. 
+- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation. +- Decide whether the current architecture is innovative. +- USE CRITICAL THINKING! + +2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt. + +3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting. +- Observe carefully about whether the implementation is actually doing what it is supposed to do. +- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation. +- Try to avoid the implementation being too similar to the previous agent. + +And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection. + +Your response should be organized as follows: + +"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements. + +"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response. + +"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.) + +"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code. +""" + +Reflexion_prompt_2 = """Using the tips in "## WRONG Implementation examples" section, revise the code further. 
def get_init_archive():
    """Return the seed archive of hand-written baseline agents.

    Returns:
        list[dict]: The module-level seed solutions in a fixed order. The
        dict objects themselves are returned (no copies are made).
    """
    return [COT, COT_SC, Reflexion, LLM_debate, Take_a_step_back, QD, Role_Assignment]


def get_prompt(current_archive, adaptive=False):
    """Render the meta-agent prompt for the current archive.

    Args:
        current_archive: Iterable of JSON-serialisable solutions; each one is
            dumped with ``json.dumps`` and the results are joined into a
            bracketed, comma-and-newline separated list.
        adaptive: Accepted for interface compatibility; not used here.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)`` where the user
        prompt is ``base`` with "[ARCHIVE]" and "[EXAMPLE]" filled in.
    """
    archive_str = "[" + ",\n".join(json.dumps(sol) for sol in current_archive) + "]"

    # Fill the fixed EXAMPLE placeholder *before* injecting the archive.
    # Archive entries hold free-form generated text; substituting them last
    # guarantees their content is never rescanned, so a literal "[EXAMPLE]"
    # inside an archived solution is preserved verbatim.
    prompt = base.replace("[EXAMPLE]", json.dumps(EXAMPLE))
    prompt = prompt.replace("[ARCHIVE]", archive_str)

    return system_prompt, prompt


def get_reflexion_prompt(prev_example):
    """Build the two reflexion-round prompts.

    Args:
        prev_example: The previously proposed agent (JSON-serialisable), or a
            falsy value when there is none.

    Returns:
        tuple[str, str]: ``Reflexion_prompt_1`` with its "[EXAMPLE]"
        placeholder replaced by a preamble describing ``prev_example`` (or by
        the empty string), and ``Reflexion_prompt_2`` unchanged.
    """
    # Serialise only when there is something to show.
    if prev_example:
        preamble = "Here is the previous agent you tried:\n" + json.dumps(prev_example) + "\n\n"
    else:
        preamble = ""
    return Reflexion_prompt_1.replace("[EXAMPLE]", preamble), Reflexion_prompt_2
This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." 
+ N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Majority voting function to select the most common answer + from collections import Counter + def majority_voting(answers): + return Counter(answers).most_common(1)[0][0] + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.append(answer.content) + + # Ensembling the answers from multiple CoT agents + answer = majority_voting(possible_answers) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." 
+ critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." + + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Biology Expert', 'Physics Expert', 'Chemistry Expert', 'Science Generalist']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." 
+ + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Physics Expert', 'Chemistry Expert', 'Biology Expert', 'Science Generalist']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Physics, Chemistry, Biology Expert, or Science Generalist." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'physics' in choice.content.lower(): + expert_id = 0 + elif 'chemistry' in choice.content.lower(): + expert_id = 1 + elif 'biology' in choice.content.lower(): + expert_id = 2 + else: + expert_id = 3 # Default to Science Generalist + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the GPQA (Graduate-Level Google-Proof Q&A Benchmark). This benchmark consists of challenging multiple-choice questions across the domains of biology, physics, and chemistry, designed by domain experts to ensure high quality and difficulty. + +## An example question from GPQA: + +Two quantum states with energies E1 and E2 have a lifetime of 10^-9 sec and 10^-8 sec, respectively. We want to clearly distinguish these two energy levels. Which one of the following options could be their energy difference so that they be clearly resolved? + +Answer choices: +10^-9 eV +10^-8 eV +10^-7 eV +10^-6 eV + +Correct answer [Not provided]: +10^-7 eV + +Explanation [Not provided]: +According to the uncertainty principle, Delta E* Delta t=hbar/2. Delta t is the lifetime and Delta E is the width of the energy level. With Delta t=10^-9 s==> Delta E1= 3.3 10^-7 ev. And Delta t=10^-11 s gives Delta E2=3.310^-8 eV. +Therefore, the energy difference between the two states must be significantly greater than 10^-7 ev. So the answer is 10^-4 ev. 
+ +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." + +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. 
+ \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." 
for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. + \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. 
+ \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. 
+ +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. 
# First-round reflexion prompt.
#
# The leading "[EXAMPLE]" placeholder is substituted by get_reflexion_prompt()
# with a "previous agent" preamble (or with the empty string when there is no
# previous agent).
#
# Fixes relative to the earlier literal:
#   * it was opened with four quotes (f""""), and a matching double quote sat
#     after "points:", so the placeholder -- and hence the whole injected
#     previous-agent JSON -- ended up wrapped in a pair of stray quotes;
#   * it was an f-string with no replacement fields, so any brace added to the
#     text later would have been interpreted as a format field.
Reflexion_prompt_1 = """[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points:

1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings.
- Make sure to check the difference between the proposed architecture and previous attempts.
- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation.
- Decide whether the current architecture is innovative.
- USE CRITICAL THINKING!

2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt.

3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting.
- Observe carefully about whether the implementation is actually doing what it is supposed to do.
- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation.
- Try to avoid the implementation being too similar to the previous agent.

And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection.

Your response should be organized as follows:

"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements.

"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response.

"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.)

"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code.
"""
def get_init_archive():
    """Return the seed archive of hand-written baseline agents.

    Returns:
        list[dict]: The module-level seed solutions in a fixed order. The
        dict objects themselves are returned (no copies are made).
    """
    return [COT, COT_SC, Reflexion, LLM_debate, Take_a_step_back, QD, Role_Assignment]


def get_prompt(current_archive, adaptive=False):
    """Render the meta-agent prompt for the current archive.

    Args:
        current_archive: Iterable of JSON-serialisable solutions; each one is
            dumped with ``json.dumps`` and the results are joined into a
            bracketed, comma-and-newline separated list.
        adaptive: Accepted for interface compatibility; not used here.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)`` where the user
        prompt is ``base`` with "[ARCHIVE]" and "[EXAMPLE]" filled in.
    """
    archive_str = "[" + ",\n".join(json.dumps(sol) for sol in current_archive) + "]"

    # Fill the fixed EXAMPLE placeholder *before* injecting the archive.
    # Archive entries hold free-form generated text; substituting them last
    # guarantees their content is never rescanned, so a literal "[EXAMPLE]"
    # inside an archived solution is preserved verbatim.
    prompt = base.replace("[EXAMPLE]", json.dumps(EXAMPLE))
    prompt = prompt.replace("[ARCHIVE]", archive_str)

    return system_prompt, prompt


def get_reflexion_prompt(prev_example):
    """Build the two reflexion-round prompts.

    Args:
        prev_example: The previously proposed agent (JSON-serialisable), or a
            falsy value when there is none.

    Returns:
        tuple[str, str]: ``Reflexion_prompt_1`` with its "[EXAMPLE]"
        placeholder replaced by a preamble describing ``prev_example`` (or by
        the empty string), and ``Reflexion_prompt_2`` unchanged.
    """
    # Serialise only when there is something to show.
    if prev_example:
        preamble = "Here is the previous agent you tried:\n" + json.dumps(prev_example) + "\n\n"
    else:
        preamble = ""
    return Reflexion_prompt_1.replace("[EXAMPLE]", preamble), Reflexion_prompt_2
This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." 
+ N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Majority voting function to select the most common answer + from collections import Counter + def majority_voting(answers): + return Counter(answers).most_common(1)[0][0] + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.append(answer.content) + + # Ensembling the answers from multiple CoT agents + answer = majority_voting(possible_answers) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." 
+ critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." + + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Math Professor', 'Calculus Tutor', 'Math Enthusiast','Data Science Mentor']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." 
+ + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Math Professor', 'Calculus Tutor', 'Math Enthusiast', 'Data Science Mentor', 'Helpful Assistant']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Math Professor, Calculus Tutor, Math Enthusiast, Data Science Mentor." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'professor' in choice.content.lower(): + expert_id = 0 + elif 'tutor' in choice.content.lower(): + expert_id = 1 + elif 'enthusiast' in choice.content.lower(): + expert_id = 2 + elif 'mentor' in choice.content.lower(): + expert_id = 3 + else: + expert_id = 4 # Default to helpful assistant + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the MATH Benchmark which evaluates mathematical problem-solving abilities. + +## An example question from MATH: + +**Question**: A particular convex pentagon has two congruent, acute angles. The measure of each of the other interior angles is equal to the sum of the measures of the two acute angles. What is the common measure of the large angles, in degrees? 
+ +**Answer (Not Given)**: 135 + +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." + +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. 
+ \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." 
for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. + \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. 
+ \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. 
+ +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. 
This is WRONG: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +response_infos = reasoning_agent([taskInfo] + ..., reasoning_instruction) + +# Extract the final answer from the response_infos +for info in response_infos: + if info.name == 'final_answer': + return info +# Fallback if no answer is found +return Info('answer', 'Final Decision Agent', 'No answer generated.', 0) +``` +You should not extract the final answer by yourself. You SHOULD directly return the answer Info. Also, you should always return the best answer you can get. +CORRECT example: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +thinking, answer = reasoning_agent([taskInfo] + ..., reasoning_instruction) +return answer +``` + +# Your task +You are deeply familiar with LLM prompting techniques and LLM agent works from the literature. Your goal is to maximize "fitness" by proposing interestingly new agents. +Observe the discovered architectures carefully and think about what insights, lessons, or stepping stones can be learned from them. +Be creative to think about the next interesting architecture to try. You are encouraged to draw inspiration from related LLM agent papers or academic papers from other research areas. +Using the knowledge learned from the archive and the inspiration from academic literature to give the next interesting architecture. +THINK OUTSIDE THE BOX. +""" + +Reflexion_prompt_1 = f""""[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points:" + +1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings. +- Make sure to check the difference between the proposed architecture and previous attempts. 
+- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation. +- Decide whether the current architecture is innovative. +- USE CRITICAL THINKING! + +2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt. + +3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting. +- Observe carefully about whether the implementation is actually doing what it is supposed to do. +- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation. +- Try to avoid the implementation being too similar to the previous agent. + +And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection. + +Your response should be organized as follows: + +"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements. + +"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response. + +"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.) + +"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code. +""" + +Reflexion_prompt_2 = """Using the tips in "## WRONG Implementation examples" section, revise the code further. 
+Your response should be organized as follows: +Put your new reflection thinking in "reflection". Repeat the previous "thought" and "name", and update the corrected version of the code in "code". +""" + + +def get_init_archive(): + return [COT, COT_SC, Reflexion, LLM_debate, Take_a_step_back, QD, Role_Assignment] + + +def get_prompt(current_archive, adaptive=False): + archive_str = ",\n".join([json.dumps(sol) for sol in current_archive]) + archive_str = f"[{archive_str}]" + prompt = base.replace("[ARCHIVE]", archive_str) + prompt = prompt.replace("[EXAMPLE]", json.dumps(EXAMPLE)) + + return system_prompt, prompt + + +def get_reflexion_prompt(prev_example): + prev_example_str = "Here is the previous agent you tried:\n" + json.dumps(prev_example) + "\n\n" + r1 = Reflexion_prompt_1.replace("[EXAMPLE]", prev_example_str) if prev_example else Reflexion_prompt_1.replace("[EXAMPLE]", "") + return r1, Reflexion_prompt_2 \ No newline at end of file diff --git a/methods/ADAS/prompt/main_prompt.py b/methods/ADAS/prompt/main_prompt.py new file mode 100644 index 0000000..3357b0c --- /dev/null +++ b/methods/ADAS/prompt/main_prompt.py @@ -0,0 +1,544 @@ +import json + +EXAMPLE = { + "thought": "**Insights:**\nYour insights on what should be the next interesting agent.\n**Overall Idea:**\nyour reasoning and the overall concept behind the agent design.\n**Implementation:**\ndescribe the implementation step by step.", + "name": "Name of your proposed agent", + "code": """def forward(self, taskInfo): + # Your code here + return answer +""" +} + +COT = { + "thought": "By encouraging the LLM to think step by step rather than directly outputting an answer, chain-of-thought reasoning enables complex problem-solving through intermediate steps. 
This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." 
+ N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Majority voting function to select the most common answer + from collections import Counter + def majority_voting(answers): + return Counter(answers).most_common(1)[0][0] + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.append(answer.content) + + # Ensembling the answers from multiple CoT agents + answer = majority_voting(possible_answers) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." 
+ critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." + + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Math Professor', 'Calculus Tutor', 'Math Enthusiast','Data Science Mentor']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." 
+ + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Math Professor', 'Calculus Tutor', 'Math Enthusiast', 'Data Science Mentor', 'Helpful Assistant']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Math Professor, Calculus Tutor, Math Enthusiast, Data Science Mentor." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'professor' in choice.content.lower(): + expert_id = 0 + elif 'tutor' in choice.content.lower(): + expert_id = 1 + elif 'enthusiast' in choice.content.lower(): + expert_id = 2 + elif 'mentor' in choice.content.lower(): + expert_id = 3 + else: + expert_id = 4 # Default to helpful assistant + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the MATH Benchmark which evaluates mathematical problem-solving abilities. + +## An example question from MATH: + +**Question**: A particular convex pentagon has two congruent, acute angles. The measure of each of the other interior angles is equal to the sum of the measures of the two acute angles. What is the common measure of the large angles, in degrees?
+ +**Answer (Not Given)**: 135 + +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." + +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. 
+ \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." 
for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. + \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. 
+ \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. 
+ +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. 
This is WRONG: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +response_infos = reasoning_agent([taskInfo] + ..., reasoning_instruction) + +# Extract the final answer from the response_infos +for info in response_infos: + if info.name == 'final_answer': + return info +# Fallback if no answer is found +return Info('answer', 'Final Decision Agent', 'No answer generated.', 0) +``` +You should not extract the final answer by yourself. You SHOULD directly return the answer Info. Also, you should always return the best answer you can get. +CORRECT example: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +thinking, answer = reasoning_agent([taskInfo] + ..., reasoning_instruction) +return answer +``` + +# Your task +You are deeply familiar with LLM prompting techniques and LLM agent works from the literature. Your goal is to maximize "fitness" by proposing interestingly new agents. +Observe the discovered architectures carefully and think about what insights, lessons, or stepping stones can be learned from them. +Be creative to think about the next interesting architecture to try. You are encouraged to draw inspiration from related LLM agent papers or academic papers from other research areas. +Using the knowledge learned from the archive and the inspiration from academic literature to give the next interesting architecture. +THINK OUTSIDE THE BOX. +""" + +Reflexion_prompt_1 = f""""[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points:" + +1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings. +- Make sure to check the difference between the proposed architecture and previous attempts. 
+- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation. +- Decide whether the current architecture is innovative. +- USE CRITICAL THINKING! + +2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt. + +3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting. +- Observe carefully about whether the implementation is actually doing what it is supposed to do. +- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation. +- Try to avoid the implementation being too similar to the previous agent. + +And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection. + +Your response should be organized as follows: + +"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements. + +"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response. + +"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.) + +"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code. +""" + +Reflexion_prompt_2 = """Using the tips in "## WRONG Implementation examples" section, revise the code further. 
+Your response should be organized as follows: +Put your new reflection thinking in "reflection". Repeat the previous "thought" and "name", and update the corrected version of the code in "code". +""" + + +def get_init_archive(): + return [COT, COT_SC, Reflexion, LLM_debate, Take_a_step_back, QD, Role_Assignment] + + +def get_prompt(current_archive, adaptive=False): + archive_str = ",\n".join([json.dumps(sol) for sol in current_archive]) + archive_str = f"[{archive_str}]" + prompt = base.replace("[ARCHIVE]", archive_str) + prompt = prompt.replace("[EXAMPLE]", json.dumps(EXAMPLE)) + + return system_prompt, prompt + + +def get_reflexion_prompt(prev_example): + prev_example_str = "Here is the previous agent you tried:\n" + json.dumps(prev_example) + "\n\n" + r1 = Reflexion_prompt_1.replace("[EXAMPLE]", prev_example_str) if prev_example else Reflexion_prompt_1.replace("[EXAMPLE]", "") + return r1, Reflexion_prompt_2 \ No newline at end of file diff --git a/methods/ADAS/prompt/mgsm_prompt.py b/methods/ADAS/prompt/mgsm_prompt.py new file mode 100644 index 0000000..44689a4 --- /dev/null +++ b/methods/ADAS/prompt/mgsm_prompt.py @@ -0,0 +1,542 @@ +import json + +EXAMPLE = { + "thought": "**Insights:**\nYour insights on what should be the next interesting agent.\n**Overall Idea:**\nyour reasoning and the overall concept behind the agent design.\n**Implementation:**\ndescribe the implementation step by step.", + "name": "Name of your proposed agent", + "code": """def forward(self, taskInfo): + # Your code here + return answer +""" +} + +COT = { + "thought": "By encouraging the LLM to think step by step rather than directly outputting an answer, chain-of-thought reasoning enables complex problem-solving through intermediate steps. 
This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." 
+ N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Majority voting function to select the most common answer + from collections import Counter + def majority_voting(answers): + return Counter(answers).most_common(1)[0][0] + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.append(answer.content) + + # Ensembling the answers from multiple CoT agents + answer = majority_voting(possible_answers) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." 
+ critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." + + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Math Professor', 'Grade School Teacher', 'Math Enthusiast']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." 
+ + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Math Professor', 'Grade School Teacher', 'Math Enthusiast', 'Helpful Assistant']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Math Professor, Grade School Teacher, Math Enthusiast." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'professor' in choice.content.lower(): + expert_id = 0 + elif 'teacher' in choice.content.lower(): + expert_id = 1 + elif 'enthusiast' in choice.content.lower(): + expert_id = 2 + else: + expert_id = 3 # Default to helpful assistant + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the Multilingual Grade School Math Benchmark (MGSM) which evaluates mathematical problem-solving abilities across various languages to ensure broad and effective multilingual performance. + +## An example question from MGSM: + +**Question**: この数学の問題を解いてください。\n\n近所では、ペットのウサギの数がペットの犬と猫を合わせた数よりも12匹少ない。犬1匹あたり2匹の猫がおり、犬の数は60匹だとすると、全部で近所には何匹のペットがいますか? + +**Answer (Not Given)**: 348 + +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." 
+ +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. + \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. 
The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. 
+ \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. + \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. 
You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. + +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. 
This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. This is WRONG: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +response_infos = reasoning_agent([taskInfo] + ..., reasoning_instruction) + +# Extract the final answer from the response_infos +for info in response_infos: + if info.name == 'final_answer': + return info +# Fallback if no answer is found +return Info('answer', 'Final Decision Agent', 'No answer generated.', 0) +``` +You should not extract the final answer by yourself. You SHOULD directly return the answer Info. Also, you should always return the best answer you can get. +CORRECT example: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +thinking, answer = reasoning_agent([taskInfo] + ..., reasoning_instruction) +return answer +``` + +# Your task +You are deeply familiar with LLM prompting techniques and LLM agent works from the literature. Your goal is to maximize "fitness" by proposing interestingly new agents. +Observe the discovered architectures carefully and think about what insights, lessons, or stepping stones can be learned from them. +Be creative to think about the next interesting architecture to try. You are encouraged to draw inspiration from related LLM agent papers or academic papers from other research areas. 
+Using the knowledge learned from the archive and the inspiration from academic literature to give the next interesting architecture. +THINK OUTSIDE THE BOX. +""" + +Reflexion_prompt_1 = f""""[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points:" + +1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings. +- Make sure to check the difference between the proposed architecture and previous attempts. +- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation. +- Decide whether the current architecture is innovative. +- USE CRITICAL THINKING! + +2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt. + +3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting. +- Observe carefully about whether the implementation is actually doing what it is supposed to do. +- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation. +- Try to avoid the implementation being too similar to the previous agent. + +And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection. 
+ +Your response should be organized as follows: + +"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements. + +"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response. + +"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.) + +"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code. +""" + +Reflexion_prompt_2 = """Using the tips in "## WRONG Implementation examples" section, revise the code further. +Your response should be organized as follows: +Put your new reflection thinking in "reflection". Repeat the previous "thought" and "name", and update the corrected version of the code in "code". +""" + + +def get_init_archive(): + return [COT, COT_SC, Reflexion, LLM_debate, Take_a_step_back, QD, Role_Assignment] + + +def get_prompt(current_archive, adaptive=False): + archive_str = ",\n".join([json.dumps(sol) for sol in current_archive]) + archive_str = f"[{archive_str}]" + prompt = base.replace("[ARCHIVE]", archive_str) + prompt = prompt.replace("[EXAMPLE]", json.dumps(EXAMPLE)) + + return system_prompt, prompt + + +def get_reflexion_prompt(prev_example): + prev_example_str = "Here is the previous agent you tried:\n" + json.dumps(prev_example) + "\n\n" + r1 = Reflexion_prompt_1.replace("[EXAMPLE]", prev_example_str) if prev_example else Reflexion_prompt_1.replace("[EXAMPLE]", "") + return r1, Reflexion_prompt_2 \ No newline at end of file diff --git a/methods/ADAS/prompt/mmlu_prompt.py b/methods/ADAS/prompt/mmlu_prompt.py new file mode 100644 index 0000000..45ea976 --- /dev/null +++ b/methods/ADAS/prompt/mmlu_prompt.py @@ -0,0 +1,547 @@ +import json + +EXAMPLE = { + "thought": "**Insights:**\nYour insights on what should be the next interesting 
agent.\n**Overall Idea:**\nyour reasoning and the overall concept behind the agent design.\n**Implementation:**\ndescribe the implementation step by step.", + "name": "Name of your proposed agent", + "code": """def forward(self, taskInfo): + # Your code here + return answer +""" +} + +COT = { + "thought": "By encouraging the LLM to think step by step rather than directly outputting an answer, chain-of-thought reasoning enables complex problem-solving through intermediate steps. This practice improves the model's ability to handle tasks that require deeper reasoning and provides insight into its decision-making process.", + "name": "Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for the Chain-of-Thought (CoT) approach + # It is an important practice that allows the LLM to think step by step before solving the task. + cot_instruction = "Please think step by step and then solve the task." + + # Instantiate a new LLM agent specifically for CoT + # To allow LLM thinking before answering, we need to set an additional output field 'thinking'. + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Prepare the inputs for the CoT agent + # The input should be a list of Info, and the first one is often the taskInfo + cot_agent_inputs = [taskInfo] + + # Get the response from the CoT agent + thinking, answer = cot_agent(cot_agent_inputs, cot_instruction) + + # Return only the final answer + return answer +""" +} + +COT_SC = {"thought": "While an LLM can arrive at the correct answer, its reasoning may vary. By repeatedly asking the same question with high temperature settings, we can generate different reasoning paths. 
We then combine multiple answers from these Chain-of-Thought (CoT) agents to produce a more accurate final answer through ensembling.", + "name": "Self-Consistency with Chain-of-Thought", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + N = 5 # Number of CoT agents + + # Initialize multiple CoT agents with a higher temperature for varied reasoning + cot_agents = [LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent', temperature=0.8) for _ in range(N)] + + # Majority voting function to select the most common answer + from collections import Counter + def majority_voting(answers): + return Counter(answers).most_common(1)[0][0] + + possible_answers = [] + for i in range(N): + thinking, answer = cot_agents[i]([taskInfo], cot_instruction) + possible_answers.append(answer.content) + + # Ensembling the answers from multiple CoT agents + answer = majority_voting(possible_answers) + return answer +""" + } + +Reflexion = { + "thought": "To enhance its performance, an LLM can iteratively improve its answer based on feedback. By reflecting on its previous attempts and incorporating feedback, the model can refine its reasoning and provide a more accurate solution.", + "name": "Self-Refine (Reflexion)", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for reflecting on previous attempts and feedback to improve + cot_reflect_instruction = "Given previous attempts and feedback, carefully consider where you could go wrong in your latest attempt. Using insights from previous attempts, try to solve the task better." 
+ cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for providing feedback and correcting the answer + critic_instruction = "Please review the answer above and criticize on where might be wrong. If you are absolutely sure it is correct, output 'True' in 'correct'." + critic_agent = LLMAgentBase(['feedback', 'correct'], 'Critic Agent') + + N_max = 5 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + for i in range(N_max): + # Get feedback and correct status from the critic + feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) + if correct.content == 'True': + break + + # Add feedback to the inputs for the next iteration + cot_inputs.extend([thinking, answer, feedback]) + + # Reflect on previous attempts and refine the answer + thinking, answer = cot_agent(cot_inputs, cot_reflect_instruction, i + 1) + return answer +""" +} + +LLM_debate = { + "thought": "By letting different LLMs debate with each other, we can leverage their diverse perspectives to find better solutions for tasks.", + "name": "LLM Debate", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + debate_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for debating and updating the solution based on other agents' solutions + debate_instruction = "Given solutions to the problem from other agents, consider their opinions as additional advice. Please think carefully and provide an updated answer." 
+ + # Initialize debate agents with different roles and a moderate temperature for varied reasoning + debate_agents = [LLMAgentBase(['thinking', 'answer'], 'Debate Agent', temperature=0.8, role=role) for role in ['Biology Expert', 'Physics Expert', 'Chemistry Expert', 'Science Generalist']] + + # Instruction for final decision-making based on all debates and solutions + final_decision_instruction = "Given all the above thinking and answers, reason over them carefully and provide a final answer." + final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + max_round = 2 # Maximum number of debate rounds + all_thinking = [[] for _ in range(max_round)] + all_answer = [[] for _ in range(max_round)] + + # Perform debate rounds + for r in range(max_round): + for i in range(len(debate_agents)): + if r == 0: + thinking, answer = debate_agents[i]([taskInfo], debate_initial_instruction) + else: + input_infos = [taskInfo] + [all_thinking[r-1][i]] + all_thinking[r-1][:i] + all_thinking[r-1][i+1:] + thinking, answer = debate_agents[i](input_infos, debate_instruction) + all_thinking[r].append(thinking) + all_answer[r].append(answer) + + # Make the final decision based on all debate results and solutions + thinking, answer = final_decision_agent([taskInfo] + all_thinking[max_round-1] + all_answer[max_round-1], final_decision_instruction) + return answer +""" +} + +Take_a_step_back = {"thought": "Let LLM first think about the principles involved in solving this task which could be helpful. By understanding the underlying principles, the model can better reason through the problem and provide a more accurate solution.", + "name": "Step-back Abstraction", + "code": """def forward(self, taskInfo): + # Instruction for understanding the principles involved in the task + principle_instruction = "What are the physics, chemistry or biology principles and concepts involved in solving this task? First think step by step. 
Then list all involved principles and explain them." + + # Instruction for solving the task based on the principles + cot_instruction = "Given the question and the involved principle behind the question, think step by step and then solve the task." + + # Instantiate LLM agents + principle_agent = LLMAgentBase(['thinking', 'principle'], 'Principle Agent') + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Get the principles involved in the task + thinking, principle = principle_agent([taskInfo], principle_instruction) + + # Use the principles to solve the task + thinking, answer = cot_agent([taskInfo, thinking, principle], cot_instruction) + return answer +""" + } + +QD = {"thought": "Similar to Quality-Diversity methods, let LLM generate multiple diverse interesting solutions could help. By encouraging the model to explore different reasoning paths, we can increase the chances of finding the best solution.", + "name": "Quality-Diversity", + "code": """def forward(self, taskInfo): + # Instruction for initial reasoning + cot_initial_instruction = "Please think step by step and then solve the task." + + # Instruction for giving diverse answers + qd_instruction = "Given previous attempts, try to come up with another interesting way to solve the task." + cot_agent = LLMAgentBase(['thinking', 'answer'], 'Chain-of-Thought Agent') + + # Instruction for final decision-making based on collected reasoning and answers + final_decision_instruction = "Given all the above solutions, reason over them carefully and provide a final answer." 
+ final_decision_agent = LLMAgentBase(['thinking', 'answer'], 'Final Decision Agent', temperature=0.1) + + N_max = 3 # Maximum number of attempts + + # Initial attempt + cot_inputs = [taskInfo] + possible_answers = [] + thinking, answer = cot_agent(cot_inputs, cot_initial_instruction, 0) + + # Add the answer to the list of possible answers + possible_answers.extend([thinking, answer]) + + for i in range(N_max): + # Reflect on previous attempts and generate another interesting answer + cot_inputs.extend([thinking, answer]) + + # Generate another interesting answer + thinking, answer = cot_agent(cot_inputs, qd_instruction, i + 1) + possible_answers.extend([thinking, answer]) + + # Make the final decision based on all generated answers + thinking, answer = final_decision_agent([taskInfo] + possible_answers, final_decision_instruction) + return answer +""" + } + +Role_Assignment = {"thought": "Similar to Auto-GPT and expert prompting, we can use dynamic control flow in the design to let the agent decide what expert we should use.", + "name": "Dynamic Assignment of Roles", + "code": """def forward(self, taskInfo): + # Instruction for step-by-step reasoning + cot_instruction = "Please think step by step and then solve the task." + expert_agents = [LLMAgentBase(['thinking', 'answer'], 'Expert Agent', role=role) for role in ['Physics Expert', 'Chemistry Expert', 'Biology Expert', 'Science Generalist']] + + # Instruction for routing the task to the appropriate expert + routing_instruction = "Given the task, please choose an Expert to answer the question. Choose from: Physics, Chemistry, Biology Expert, or Science Generalist." 
+ routing_agent = LLMAgentBase(['choice'], 'Routing agent') + + # Get the choice of expert to route the task + choice = routing_agent([taskInfo], routing_instruction)[0] + + if 'physics' in choice.content.lower(): + expert_id = 0 + elif 'chemistry' in choice.content.lower(): + expert_id = 1 + elif 'biology' in choice.content.lower(): + expert_id = 2 + else: + expert_id = 3 # Default to Science Generalist + + thinking, answer = expert_agents[expert_id]([taskInfo], cot_instruction) + return answer +""" + } + +system_prompt = """You are a helpful assistant. Make sure to return in a WELL-FORMED JSON object.""" + +base = """# Overview +You are an expert machine learning researcher testing various agentic systems. Your objective is to design building blocks such as prompts and control flows within these systems to solve complex tasks. Your aim is to design an optimal agent performing well on the MMLU (Massive Multitask Language Understanding) benchmark, a challenging evaluation that assesses a model's ability to answer questions across a wide range of subjects and difficulty levels. It includes subjects from STEM, social sciences, humanities, and more. + +## An example question from MMLU: + +Answer the following multiple choice question. + +The constellation ... is a bright W-shaped constellation in the northern sky. 
+ +(A) Centaurus +(B) Cygnus +(C) Cassiopeia +(D) Cepheus + +# The utility code: + +```python +from collections import namedtuple +from typing import Union +import numpy as np +import json + +import openai +import backoff +from utils import random_id + +# Initialize the OpenAI client +client = openai.OpenAI() + +# Named tuple for holding task information +Info = namedtuple('Info', ['name', 'author', 'content', 'iteration_idx']) + +# Format instructions for LLM response +FORMAT_INST = lambda request_keys: f"Reply EXACTLY with the following JSON format.\n{str(request_keys)}\nDO NOT MISS ANY FIELDS AND MAKE SURE THE JSON FORMAT IS CORRECT!\n" + +# Description of the role for the LLM +ROLE_DESC = lambda role: f"You are a {role}." + +@backoff.on_exception(backoff.expo, openai.RateLimitError) +def get_json_response_from_gpt(msg, model, system_message, temperature=0.5): + \""" + Function to get JSON response from GPT model. + + Args: + - msg (str): The user message. + - model (str): The model to use. + - system_message (str): The system message. + - temperature (float): Sampling temperature. + + Returns: + - dict: The JSON response. + \""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": msg}, + ], + temperature=temperature, + max_tokens=1024, + stop=None, + response_format={"type": "json_object"} + ) + content = response.choices[0].message.content + json_dict = json.loads(content) + return json_dict + +class LLMAgentBase: + \""" + Base class for an LLM agent. + + Attributes: + - output_fields (list): Fields expected in the output. + - agent_name (str): Name of the agent. + - role (str): Role description for the agent. + - model (str): Model to be used. (option. Keep it default.) + - temperature (float): Sampling temperature. + - id (str): Unique identifier for the agent instance. 
+ \""" + + def __init__(self, output_fields: list, agent_name: str, role='helpful assistant', model='gpt-3.5-turbo-0125', temperature=0.5) -> None: + self.output_fields = output_fields + self.agent_name = agent_name + self.role = role + self.model = model + self.temperature = temperature + self.id = random_id() + + def generate_prompt(self, input_infos, instruction) -> str: + \""" + Generates a prompt for the LLM. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + + Returns: + - tuple: System prompt and user prompt. + + An example of a generated prompt: + "" + You are a helpful assistant. + + # Output Format: + Reply EXACTLY with the following JSON format. + ... + + # Your Task: + You will be given some number of paired example inputs and outputs. The outputs ... + + ### thinking #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### code #1 by Chain-of-Thought Agent hkFo (yourself): + ... + + ### answer by Chain-of-Thought Agent hkFo's code evaluator:... + + + # Instruction: + Please think step by step and then solve the task by writing the code. + "" + \""" + output_fields_and_description = {key: f"Your {key}." if not 'answer' in key else f"Your {key}. Return ONLY the alphabet choice, i.e. A or B or C or D." 
for key in self.output_fields} + system_prompt = ROLE_DESC(self.role) + "\n\n" + FORMAT_INST(output_fields_and_description) + + input_infos_text = '' + for input_info in input_infos: + if isinstance(input_info, Info): + (field_name, author, content, iteration_idx) = input_info + else: + continue + if author == self.__repr__(): + author += ' (yourself)' + if field_name == 'task': + input_infos_text += f'# Your Task:\n{content}\n\n' + elif iteration_idx != -1: + input_infos_text += f'### {field_name} #{iteration_idx+1} by {author}:\n{content}\n\n' + else: + input_infos_text += f'### {field_name} by {author}:\n{content}\n\n' + + prompt = input_infos_text + instruction + return system_prompt, prompt + + def query(self, input_infos: list, instruction, iteration_idx=-1) -> list[Info]: + \""" + Queries the LLM with provided input information and instruction. + + Args: + - input_infos (list): List of input information. + - instruction (str): Instruction for the task. + - iteration_idx (int): Iteration index for the task. + + Returns: + - output_infos (list[Info]): Output information. + \""" + system_prompt, prompt = self.generate_prompt(input_infos, instruction) + response_json = get_json_response_from_gpt(prompt, self.model, system_prompt, self.temperature) + + output_infos = [] + for key, value in response_json.items(): + info = Info(key, self.__repr__(), value, iteration_idx) + output_infos.append(info) + return output_infos + + def __repr__(self): + return f"{self.agent_name} {self.id}" + + def __call__(self, input_infos: list, instruction, iteration_idx=-1): + # Note: + # The output of the LLM is a list of Info. If you are only querying one output, you should access it with [0]. + # It is a good practice to always include 'thinking' in the output. + return self.query(input_infos, instruction, iteration_idx=iteration_idx) + +class AgentArchitecture: + \""" + Fill in your code here. 
+ \""" + def forward(self, taskInfo) -> Union[Info, str]: + \""" + Placeholder method for processing task information. + + Args: + - taskInfo (Info): Task information. + + Returns: + - Answer (Union[Info, str]): Your FINAL Answer. Return either a namedtuple Info or a string of answers. + \""" + pass +``` +# Discovered architecture archive +Here is the archive of the discovered architectures: + +[ARCHIVE] + +The fitness value is the median and 95% Bootstrap Confidence Interval of the correct rate on a validation question set. Your GOAL is to maximize the "fitness". + +# Output Instruction and Example: +The first key should be ("thought"), and it should capture your thought process for designing the next function. In the "thought" section, first reason about what should be the next interesting agent to try, then describe your reasoning and the overall concept behind the agent design, and finally detail the implementation steps. +The second key ("name") corresponds to the name of your next agent architecture. +Finally, the last key ("code") corresponds to the exact “forward()” function in Python code that you would like to try. You must write a COMPLETE CODE in "code": Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets. + +Here is an example of the output format for the next agent architecture: + +[EXAMPLE] + +You must use the exact function interface used above. You need to specify the instruction, input information, and the required output fields for various LLM agents to do their specific part of the architecture. +Also, it could be helpful to set the LLM’s role and temperature to further control the LLM’s response. Note that the LLMAgentBase() will automatically parse the output and return a list of “Infos”. You can get the content by Infos.content. +DO NOT FORGET the taskInfo input to LLM if you think it is needed, otherwise LLM will not know about the task. 
+ +## WRONG Implementation examples: +Here are some mistakes you may make: + +1. This is WRONG: ``` +feedback, correct = critic_agent([taskInfo, thinking, answer], critic_instruction, i) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +``` +It is wrong to use "Info('feedback', 'Critic Agent', thinking, 0)". The returned "feedback" from LLMAgentBase is already Info. + +2. This is WRONG: ``` +# Debugging: Log the generated answer +print('Generated Answer:', ...) +feedback_info = verifier_agent([taskInfo, Info('feedback', 'Critic Agent', thinking, 0)], verification_instruction) +if len(feedback_info) < 3: # Check if feedback_info has enough elements + return 'Error: Feedback info incomplete' +``` +First, the len(feedback_info) will not work. +Second, you should never return an error message. You should always return the best answer you can get. +Third, you should never print anything in the code. +Lastly, again, DO NOT CREATE Info object by yourself. + +3. This is WRONG: ``` +all_thinking = [] +all_answers = [] +for agent, role in zip(agents, roles): + outputs = agent([taskInfo], independent_reasoning_instruction.format(role=role)) + all_thinking.append(outputs[0].content) + all_answers.append(outputs[1].content) + +# Aggregate the reasoning paths and answers +aggregated_thinking = '\n'.join(all_thinking) +aggregated_answers = '\n'.join(all_answers) +``` +You SHOULD NOT extract the content from the Info object by yourself. You should use the Info object directly. If you want to aggregate the content, you should just put those Info objects into a list and then use the list as input to the next LLM agent. + +4. 
This is WRONG: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +response_infos = reasoning_agent([taskInfo] + ..., reasoning_instruction) + +# Extract the final answer from the response_infos +for info in response_infos: + if info.name == 'final_answer': + return info +# Fallback if no answer is found +return Info('answer', 'Final Decision Agent', 'No answer generated.', 0) +``` +You should not extract the final answer by yourself. You SHOULD directly return the answer Info. Also, you should always return the best answer you can get. +CORRECT example: ``` +reasoning_agent = LLMAgentBase(['thinking', 'answer'], 'Reasoning Agent') +thinking, answer = reasoning_agent([taskInfo] + ..., reasoning_instruction) +return answer +``` + +# Your task +You are deeply familiar with LLM prompting techniques and LLM agent works from the literature. Your goal is to maximize "fitness" by proposing interestingly new agents. +Observe the discovered architectures carefully and think about what insights, lessons, or stepping stones can be learned from them. +Be creative to think about the next interesting architecture to try. You are encouraged to draw inspiration from related LLM agent papers or academic papers from other research areas. +Using the knowledge learned from the archive and the inspiration from academic literature to give the next interesting architecture. +THINK OUTSIDE THE BOX. +""" + +Reflexion_prompt_1 = f""""[EXAMPLE]Carefully review the proposed new architecture and reflect on the following points:" + +1. **Interestingness**: Assess whether your proposed architecture is interesting or innovative compared to existing methods in the archive. If you determine that the proposed architecture is not interesting, suggest a new architecture that addresses these shortcomings. +- Make sure to check the difference between the proposed architecture and previous attempts. 
+- Compare the proposal and the architectures in the archive CAREFULLY, including their actual differences in the implementation. +- Decide whether the current architecture is innovative. +- USE CRITICAL THINKING! + +2. **Implementation Mistakes**: Identify any mistakes you may have made in the implementation. Review the code carefully, debug any issues you find, and provide a corrected version. REMEMBER checking "## WRONG Implementation examples" in the prompt. + +3. **Improvement**: Based on the proposed architecture, suggest improvements in the detailed implementation that could increase its performance or effectiveness. In this step, focus on refining and optimizing the existing implementation without altering the overall design framework, except if you want to propose a different architecture if the current is not interesting. +- Observe carefully about whether the implementation is actually doing what it is supposed to do. +- Check if there is redundant code or unnecessary steps in the implementation. Replace them with effective implementation. +- Try to avoid the implementation being too similar to the previous agent. + +And then, you need to improve or revise the implementation, or implement the new proposed architecture based on the reflection. + +Your response should be organized as follows: + +"reflection": Provide your thoughts on the interestingness of the architecture, identify any mistakes in the implementation, and suggest improvements. + +"thought": Revise your previous proposal or propose a new architecture if necessary, using the same format as the example response. + +"name": Provide a name for the revised or new architecture. (Don't put words like "new" or "improved" in the name.) + +"code": Provide the corrected code or an improved implementation. Make sure you actually implement your fix and improvement in this code. +""" + +Reflexion_prompt_2 = """Using the tips in "## WRONG Implementation examples" section, revise the code further. 
def get_init_archive():
    """Return the seed archive of hand-written agent designs, in fixed order."""
    seed_agents = [
        COT,
        COT_SC,
        Reflexion,
        LLM_debate,
        Take_a_step_back,
        QD,
        Role_Assignment,
    ]
    return seed_agents


def get_prompt(current_archive, adaptive=False):
    """Build the (system prompt, user prompt) pair for proposing a new agent.

    Each archive entry is JSON-encoded; the entries are joined into a
    bracketed list that replaces the ``[ARCHIVE]`` placeholder of the base
    prompt, and the JSON-encoded ``EXAMPLE`` replaces ``[EXAMPLE]``.

    Args:
        current_archive: Iterable of JSON-serialisable agent descriptions.
        adaptive: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(system_prompt, prompt)``.
    """
    encoded_entries = (json.dumps(entry) for entry in current_archive)
    archive_block = "[" + ",\n".join(encoded_entries) + "]"
    filled = base.replace("[ARCHIVE]", archive_block)
    filled = filled.replace("[EXAMPLE]", json.dumps(EXAMPLE))
    return system_prompt, filled


def get_reflexion_prompt(prev_example):
    """Build the two reflexion prompts used to refine a proposed agent.

    Args:
        prev_example: The previously tried agent (JSON-serialisable). When it
            is falsy, the ``[EXAMPLE]`` placeholder is replaced with nothing.

    Returns:
        Tuple ``(first_reflexion_prompt, second_reflexion_prompt)``.
    """
    # Serialised unconditionally, exactly like the original expression.
    previous_block = (
        "Here is the previous agent you tried:\n"
        + json.dumps(prev_example)
        + "\n\n"
    )
    if prev_example:
        substitution = previous_block
    else:
        substitution = ""
    first_prompt = Reflexion_prompt_1.replace("[EXAMPLE]", substitution)
    return first_prompt, Reflexion_prompt_2
class AFlow(MAS):
    """AFlow workflow optimisation and inference for "math" / "mbpp" datasets.

    Optimisation repeatedly selects a previously evaluated round as parent,
    asks the meta model for a modified workflow graph and prompt, evaluates
    the new workflow on the validation set and records the outcome under
    ``self.results_path``. Inference loads the workflow stored in the
    ``best_workflow`` sub-directory of that path.
    """

    def __init__(self, general_config, method_config_name="config"):
        """Load the method configuration and prepare the results directory.

        Args:
            general_config: Mapping that must contain ``'test_dataset_name'``;
                it is also forwarded to the ``MAS`` base class.
            method_config_name: Stem of the YAML file under ``configs/``;
                ``None`` falls back to ``"config"``.

        Raises:
            ValueError: If the dataset name contains neither "math" nor "mbpp".
        """
        super().__init__(general_config)
        method_config_name = "config" if method_config_name is None else method_config_name
        self.method_config = load_config(
            Path(__file__).parent / "configs" / f"{method_config_name}.yaml"
        )
        self.dataset_name = general_config['test_dataset_name']
        self.model_name_optimize = self.method_config.get('optimize_meta_model_name', 'gpt-4o')
        self.model_name_execute = self.method_config.get('optimize_execute_model_name', 'gpt-4o-mini-2024-07-18')
        self.sample = self.method_config['sample']
        self.max_rounds = self.method_config['max_rounds']
        self.validation_rounds = self.method_config['validation_rounds']
        self.earlystop = self.method_config['earlystop']
        self.root_path = str(os.path.relpath(Path(__file__).parent, start=os.getcwd()))
        self.results_path = f"results/{self.dataset_name}/aflow/{self.model_name_optimize}/{self.model_name_execute}"
        self.top_scores = []
        self.round = 1
        self.graph = None

        matches = re.findall('(math|mbpp)', self.dataset_name.lower(), flags=re.IGNORECASE)
        if not matches:
            raise ValueError("Dataset not found!")
        self.domain = matches[0]
        self.operators: list = BENCHMARK[self.domain]["operators"]
        self.type = BENCHMARK[self.domain]["type"]

        results_path = Path(self.results_path)
        if not results_path.exists():
            # First use: create empty bookkeeping files and copy the initial
            # workflow files of this domain into the results directory.
            graph_path = Path(self.root_path) / "initial_workflows" / self.domain
            results_path.mkdir(parents=True, exist_ok=True)
            exp_path = os.path.join(self.results_path, "processed_experience.json")
            res_path = os.path.join(self.results_path, "results.json")
            with open(exp_path, 'w'):
                pass
            with open(res_path, 'w'):
                pass
            for item in graph_path.iterdir():
                dest = results_path / item.name
                if item.is_dir():
                    shutil.copytree(item, dest, dirs_exist_ok=True)
                else:
                    shutil.copy2(item, dest)

        self.optimized_round = 1
        self.inference_flag = True

    def inference(self, sample, entrypoint=""):
        """Run the stored best workflow on a single sample.

        Args:
            sample: Mapping with a non-empty ``"query"`` entry.
            entrypoint: Passed to the workflow as ``entry_point`` for
                non-math domains.

        Returns:
            The workflow response; for the math domain it is post-processed
            by ``extract_model_answer``.

        Raises:
            ValueError: If the sample has no query.
            NotImplementedError: If no ``best_workflow`` directory exists.
        """
        query = sample.get("query")
        if not query:
            raise ValueError("Sample must contain a 'query' key.")
        self.inference_flag = True
        optimized_path = Path(self.results_path) / "best_workflow"
        if not optimized_path.exists():
            raise NotImplementedError("Best_workflow path does not exist!")
        graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.best_workflow.graph"
        module = importlib.import_module(graph_module_name, package=__package__)
        self.graph = getattr(module, "Workflow")

        graph = self.graph(name="Optimized", env=self)

        if self.domain == "math":
            response = asyncio.run(graph(problem=query))
            response = extract_model_answer(response)
        else:
            response = asyncio.run(graph(problem=query, entry_point=entrypoint))
        return response

    def optimizing(self, val_dataset):
        """Run up to ``max_rounds`` optimisation rounds on ``val_dataset``.

        Does nothing when a ``best_workflow`` directory already exists. After
        every round the currently best round is copied to ``best_workflow``
        and, if early stopping is enabled, convergence is checked.
        """
        self.inference_flag = False

        optimized_path = Path(self.results_path) / "best_workflow"
        if optimized_path.exists():
            print(colored("The optimal graph already exists!\n", "red"))
            return

        print(colored("Start optimizing ...\n", "yellow"))
        for i in range(self.max_rounds):
            try:
                print(colored(f"{i+1} round of optimization...\n", "light_cyan"))
                score = asyncio.run(self._optimize_graph(val_dataset))
            except Exception as e:
                # Best effort: a failed round is reported and the search goes on.
                print(f"Optimization failed: {e}")
                score = None
            self.round += 1
            print(f"Score for round {self.round}: {score}")
            self.save_optimized_graph()
            converged, convergence_round, final_round = self.check_convergence(top_k=3)
            if self.earlystop and converged:
                print(f"Convergence detected, occurred in round {convergence_round}, final round is {final_round}")
                self.print_results()
                break
            time.sleep(5)
        print(colored("Optimization complete!", "green"))
        print(colored(f"\n>> Optimization token stats: {self.get_token_stats()}", "light_yellow"))

    async def _optimize_graph(self, val_dataset):
        """Generate, store and evaluate the workflow for round ``self.round + 1``.

        In round 1 the initial workflow is evaluated first. A parent round is
        then sampled, the meta model is asked for a modification, and the
        request is repeated until the proposed modification differs from all
        modifications already recorded for that parent.

        Returns:
            The average validation score of the newly written workflow.

        Raises:
            ValueError: If the meta model response lacks a required XML field.
        """
        validation_n = self.validation_rounds
        graph_path = self.results_path
        result_path = os.path.join(graph_path, "results.json")
        data = []
        if os.path.exists(result_path):
            with open(result_path, "r") as json_file:
                try:
                    data = json.load(json_file)
                except json.JSONDecodeError:
                    # results.json starts out as an empty file.
                    data = []

        if self.round == 1:
            directory = os.path.join(graph_path, f"round_{self.round}")
            os.makedirs(directory, exist_ok=True)

            graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.round_{self.round}.graph"
            module = importlib.import_module(graph_module_name, package=__package__)
            self.graph = getattr(module, "Workflow")
            avg_score = await self.evaluate_graph(directory, validation_n, data, val_dataset, True)

        names = ["modification", "graph", "prompt"]
        types = {"modification": str, "graph": str, "prompt": str}

        while True:
            directory = os.path.join(graph_path, f"round_{self.round+1}")
            os.makedirs(directory, exist_ok=True)

            # parent <- SelectParent(results)
            top_rounds = self.get_top_rounds()
            sample, _ = self.select_round(top_rounds)

            prompt, graph_load = self.read_graph_files(sample["round"], graph_path)
            pattern = r"class Workflow:.+"
            graph = re.findall(pattern, graph_load, re.DOTALL)

            # context <- LoadContext(parent, experiences)
            processed_experience = self.load_experience()
            experience = self.format_experience(processed_experience, sample["round"])

            # Read the operator catalogue once, then describe every operator.
            path = os.path.join(graph_path, "template/operator.json")
            with open(path, "r") as f:
                operator_data = json.load(f)
            operators_description = ""
            for idx, operator in enumerate(self.operators, start=1):
                matched_data = operator_data[operator]
                desc = matched_data["description"]
                interface = matched_data["interface"]
                operators_description += f"{idx}. {operator}: {desc}, with interface {interface}).\n"

            log_data = self.load_log(sample["round"])

            graph_input = WORKFLOW_INPUT.format(
                experience=experience,
                score=sample["score"],
                graph=graph[0],
                prompt=prompt,
                # Pass the descriptions of ALL operators, not only the last one.
                operator_description=operators_description,
                type=self.type,
                log=log_data,
            )
            graph_system = WORKFLOW_OPTIMIZE_PROMPT.format(type=self.type)
            graph_optimize_prompt = graph_input + WORKFLOW_CUSTOM_USE + graph_system
            example_str = "\n".join(f"<{name}>content</{name}>" for name in names)
            graph_optimize_prompt += f"""
            ### Response format (must be strictly followed): All content must be enclosed in the given XML tags, ensuring each opening <tag> has a corresponding closing </tag>, with no incomplete or self-closing tags allowed.\n
            {example_str}
            """
            response = self.call_llm(prompt=graph_optimize_prompt, model_name=self.model_name_optimize)

            response = self.xml_extract(response, names, types)
            missing = [name for name in names if name not in response]
            if missing:
                raise ValueError(f"LLM response is missing required fields: {missing}")

            # Accept only a modification not yet tried on this parent;
            # otherwise ask the meta model again.
            check = self.check_modification(
                processed_experience, response["modification"], sample["round"]
            )
            if check:
                break

        # Save the graph and evaluate
        graph = WORKFLOW_TEMPLATE.format(graph=response["graph"], round=self.round + 1, dataset=self.dataset_name)

        with open(os.path.join(directory, "graph.py"), "w", encoding="utf-8") as file:
            file.write(graph)

        with open(os.path.join(directory, "prompt.py"), "w", encoding="utf-8") as file:
            file.write(response["prompt"])

        with open(os.path.join(directory, "__init__.py"), "w", encoding="utf-8") as file:
            file.write("")

        experience = {
            "father node": sample["round"],
            "modification": response["modification"],
            "before": sample["score"],
            "after": None,
            "succeed": None,
        }
        graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.round_{self.round+1}.graph"
        module = importlib.import_module(graph_module_name, package=__package__)
        self.graph = getattr(module, "Workflow")
        avg_score = await self.evaluate_graph(directory, validation_n, data, val_dataset)

        experience["after"] = avg_score
        experience["succeed"] = bool(avg_score > experience["before"])
        os.makedirs(directory, exist_ok=True)

        with open(os.path.join(directory, "experience.json"), "w", encoding="utf-8") as fout:
            json.dump(experience, fout, ensure_ascii=False, indent=4, default=to_jsonable_python)
        return avg_score

    async def evaluate_graph(self, directory, validation_n, data, val_dataset, initial=False):
        """Evaluate ``self.graph`` ``validation_n`` times on ``val_dataset``.

        After each pass a ``{"round", "score"}`` record is appended to
        ``data`` and the whole list is rewritten to ``results.json``.

        Args:
            directory: Round directory handed to the evaluators as log path.
            validation_n: Number of evaluation passes.
            data: Result records loaded so far; extended in place.
            val_dataset: Sized iterable of validation problems.
            initial: True when evaluating the current round itself instead of
                the next one.

        Returns:
            The mean of the per-pass average scores.
        """
        sum_score = 0
        max_concurrent_tasks = 50
        for _ in range(validation_n):
            graph = self.graph(name=self.dataset_name + f"/round_{self.round}", env=self)
            semaphore = asyncio.Semaphore(max_concurrent_tasks)
            tasks = [self._run_with_semaphore(semaphore, problem, directory, graph) for problem in val_dataset]
            results = await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {self.dataset_name} problems", total=len(val_dataset))

            columns = ["question", "prediction", "expected_output", "score"]
            df = pd.DataFrame(results, columns=columns)
            average_score = df["score"].mean()
            cur_round = self.round + 1 if initial is False else self.round
            data.append({"round": cur_round, "score": average_score})

            result_path = os.path.join(self.results_path, "results.json")
            Path(result_path).parent.mkdir(parents=True, exist_ok=True)

            with open(result_path, "w", encoding="utf-8") as fout:
                json.dump(data, fout, ensure_ascii=False, indent=4, default=to_jsonable_python)
            sum_score += average_score

        return sum_score / validation_n

    async def _run_with_semaphore(self, semaphore, problem, log_path, graph):
        """Evaluate one problem while holding ``semaphore`` to bound concurrency."""
        async with semaphore:
            if self.domain == "math":
                return await evaluate_math(problem, graph, log_path)
            return await evaluate_mbpp(problem, graph, log_path)

    def check_convergence(self, top_k=3, z=0, consecutive_rounds=5):
        """Check whether the mean of the top-k round scores has stopped changing.

        Args:
            top_k: Number of best rounds averaged at each step.
            z: Multiplier of the standard error within which a change counts
                as "no change".
            consecutive_rounds: Number of consecutive unchanged steps required.

        Returns:
            ``(converged, convergence_index, final_index)``; the indices are
            positions in the round-sorted score list, or ``None`` when not
            converged.
        """
        result_file = os.path.join(self.results_path, "results.json")
        with open(result_file, "r") as file:
            self.data = json.load(file)
        rounds = {}
        for entry in self.data:
            rounds.setdefault(entry["round"], []).append(entry["score"])
        self.rounds = rounds
        sorted_rounds = sorted(self.rounds.items(), key=lambda x: x[0])
        avg_scores = []
        stds = []
        for _, scores in sorted_rounds:
            avg_scores.append(np.mean(scores))
            stds.append(np.std(scores))
        # If total rounds are not enough to calculate top_k+1 rounds, return not converged
        if len(avg_scores) < top_k + 1:
            return False, None, None
        convergence_count = 0  # Convergence counter
        previous_y = None  # Y value of the previous round (average of top_k scores)
        sigma_y_previous = None  # Standard error of Y value from previous round
        for i in range(len(avg_scores)):
            # Dynamically select top_k from current round and all previous rounds
            top_k_indices = np.argsort(avg_scores[: i + 1])[::-1][:top_k]
            top_k_scores = [avg_scores[j] for j in top_k_indices]
            top_k_stds = [stds[j] for j in top_k_indices]
            # Mean of the top k scores up to this round
            y_current = np.mean(top_k_scores)
            # Standard error of y_current
            sigma_y_current = np.sqrt(np.sum([s**2 for s in top_k_stds]) / (top_k**2))
            if previous_y is not None:
                delta_y = y_current - previous_y
                sigma_delta_y = np.sqrt(sigma_y_current**2 + sigma_y_previous**2)
                # Converged step: the change lies within the confidence band
                if abs(delta_y) <= z * sigma_delta_y:
                    convergence_count += 1
                    if convergence_count >= consecutive_rounds:
                        return True, i - consecutive_rounds + 1, i
                else:
                    # A large change resets the counter
                    convergence_count = 0
            previous_y = y_current
            sigma_y_previous = sigma_y_current
        return False, None, None

    def get_top_rounds(self):
        """Return candidate parent rounds ordered for sampling.

        Scores in ``results.json`` are averaged per round. Round 1, when
        present, always comes first; further rounds follow in descending
        score order until ``self.sample`` entries are collected.

        Raises:
            FileNotFoundError: If ``results.json`` does not exist.
            ValueError: If ``results.json`` cannot be parsed.
        """
        rounds_dir = self.results_path
        result_file = os.path.join(rounds_dir, "results.json")
        self.top_scores = []
        if not Path(result_file).exists():
            raise FileNotFoundError(f"json_file: {result_file} not exist, return []")
        with open(result_file, "r", encoding="utf-8") as fin:
            try:
                data = json.load(fin)
            except Exception as err:
                raise ValueError(f"read json file: {result_file} failed") from err
        df = pd.DataFrame(data)

        scores_per_round = df.groupby("round")["score"].mean().to_dict()

        for round_number, average_score in scores_per_round.items():
            self.top_scores.append({"round": int(round_number), "score": average_score})

        self.top_scores.sort(key=lambda x: x["score"], reverse=True)

        unique_rounds = set()
        unique_top_scores = []

        first_round = next((item for item in self.top_scores if item["round"] == 1), None)
        if first_round:
            unique_top_scores.append(first_round)
            unique_rounds.add(1)

        for item in self.top_scores:
            if item["round"] not in unique_rounds:
                unique_top_scores.append(item)
                unique_rounds.add(item["round"])

                if len(unique_top_scores) >= self.sample:
                    break

        return unique_top_scores

    def select_round(self, items, alpha=0.2, lambda_=0.3):
        """Randomly pick one round, favouring higher scores.

        The sampling distribution mixes a uniform distribution (weight
        ``lambda_``) with a softmax over ``alpha * score * 100``.

        Args:
            items: Non-empty list of ``{"round", "score"}`` dicts.
            alpha: Softmax sharpness.
            lambda_: Weight of the uniform component.

        Returns:
            ``(selected_item, items_sorted_by_descending_score)``.

        Raises:
            ValueError: If ``items`` is empty or the weights cannot be normalised.
        """
        if not items:
            raise ValueError("Item list is empty.")

        sorted_items = sorted(items, key=lambda x: x["score"], reverse=True)
        scores = np.array([item["score"] * 100 for item in sorted_items], dtype=np.float64)
        n = len(scores)

        if n == 0:
            raise ValueError("Score list is empty.")

        uniform_prob = np.full(n, 1.0 / n, dtype=np.float64)

        # Shift by the maximum for numerical stability of the exponentials.
        max_score = np.max(scores)
        shifted_scores = scores - max_score
        exp_weights = np.exp(alpha * shifted_scores)

        sum_exp_weights = np.sum(exp_weights)
        if sum_exp_weights == 0:
            raise ValueError("Sum of exponential weights is 0, cannot normalize.")

        score_prob = exp_weights / sum_exp_weights

        mixed_prob = lambda_ * uniform_prob + (1 - lambda_) * score_prob

        total_prob = np.sum(mixed_prob)
        if not np.isclose(total_prob, 1.0):
            mixed_prob = mixed_prob / total_prob

        print(f"\nMixed probability distribution: {mixed_prob}")
        print(f"\nSorted rounds: {sorted_items}")

        selected_index = np.random.choice(len(sorted_items), p=mixed_prob)
        print(f"\nSelected index: {selected_index}, Selected item: {sorted_items[selected_index]}")

        return sorted_items[selected_index], sorted_items

    def read_graph_files(self, round_number: int, workflows_path: str):
        """Return ``(prompt.py text, graph.py text)`` of the given round.

        Errors are printed and re-raised.
        """
        prompt_file_path = os.path.join(workflows_path, f"round_{round_number}", "prompt.py")
        graph_file_path = os.path.join(workflows_path, f"round_{round_number}", "graph.py")

        try:
            with open(prompt_file_path, "r", encoding="utf-8") as file:
                prompt_content = file.read()
            with open(graph_file_path, "r", encoding="utf-8") as file:
                graph_content = file.read()
        except FileNotFoundError as e:
            print(f"Error: File not found for round {round_number}: {e}")
            raise
        except Exception as e:
            print(f"Error loading prompt for round {round_number}: {e}")
            raise
        return prompt_content, graph_content

    def load_experience(self):
        """Collect every round's ``experience.json`` grouped by parent round.

        The result maps each parent round to its original score and to the
        successful / failed modifications derived from it. It is also written
        to ``processed_experience.json``. Rounds whose experience file cannot
        be processed are reported and skipped.
        """
        rounds_dir = os.path.normpath(self.results_path)
        experience_data = defaultdict(lambda: {"score": None, "success": {}, "failure": {}})

        for round_dir in os.listdir(rounds_dir):
            round_path = os.path.join(rounds_dir, round_dir)
            if not (os.path.isdir(round_path) and round_dir.startswith("round_")):
                continue
            try:
                round_number = int(round_dir.split("_")[1])
                json_file_path = os.path.join(round_path, "experience.json")
                if not os.path.exists(json_file_path):
                    continue
                with open(json_file_path, "r", encoding="utf-8") as fin:
                    try:
                        data = json.load(fin)
                    except Exception as err:
                        raise ValueError(f"read json file: {json_file_path} failed") from err

                father_node = data["father node"]

                if experience_data[father_node]["score"] is None:
                    experience_data[father_node]["score"] = data["before"]

                outcome = "success" if data["succeed"] else "failure"
                experience_data[father_node][outcome][round_number] = {
                    "modification": data["modification"],
                    "score": data["after"],
                }
            except Exception as e:
                print(f"Error processing {round_dir}: {str(e)}")

        experience_data = dict(experience_data)

        output_path = os.path.join(rounds_dir, "processed_experience.json")
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(experience_data, outfile, indent=4, ensure_ascii=False)

        print(f"Processed experience data saved to {output_path}")
        return experience_data

    def format_experience(self, processed_experience, sample_round):
        """Render the recorded experience of ``sample_round`` as prompt text.

        Both failed and successful modifications are listed as prohibited so
        that the meta model does not propose them again.
        """
        experience_data = processed_experience.get(sample_round)
        if experience_data:
            experience = f"Original Score: {experience_data['score']}\n"
            experience += "These are some conclusions drawn from experience:\n\n"
            for value in experience_data["failure"].values():
                experience += f"-Absolutely prohibit {value['modification']} (Score: {value['score']})\n"
            for value in experience_data["success"].values():
                experience += f"-Absolutely prohibit {value['modification']} \n"
            experience += "\n\nNote: Take into account past failures and avoid repeating the same mistakes, as these failures indicate that these approaches are ineffective. You must fundamentally change your way of thinking, rather than simply using more advanced Python syntax like for, if, else, etc., or modifying the prompt."
        else:
            experience = f"No experience data found for round {sample_round}."
        return experience

    def load_log(self, cur_round):
        """Return up to three random entries of the round's ``log.json`` as text.

        Returns an empty string when the log file is missing or empty.

        Raises:
            ValueError: If the log file cannot be parsed.
        """
        log_dir = os.path.join(self.results_path, f"round_{cur_round}/log.json")
        if not os.path.exists(log_dir):
            return ""
        print(log_dir)
        with open(log_dir, "r", encoding="utf-8") as fin:
            try:
                data = json.load(fin)
            except Exception as err:
                raise ValueError(f"read json file: {log_dir} failed") from err

        if isinstance(data, dict):
            data = [data]
        elif not isinstance(data, list):
            data = list(data)

        if not data:
            return ""

        sample_size = min(3, len(data))
        random_samples = random.sample(data, sample_size)

        return "".join(
            json.dumps(sample, indent=4, ensure_ascii=False) + "\n\n"
            for sample in random_samples
        )

    def check_modification(self, processed_experience, modification, sample_round):
        """Return True when ``modification`` was not yet tried on ``sample_round``."""
        experience_data = processed_experience.get(sample_round)
        if not experience_data:
            return True
        for outcome in ("failure", "success"):
            for value in experience_data[outcome].values():
                if value["modification"] == modification:
                    return False
        return True

    def print_results(self):
        """Print average score and standard deviation for all rounds.

        An empty ``results.json`` is created when none exists.

        Returns:
            The list of records loaded from ``results.json``.
        """
        rounds_dir = os.path.normpath(self.results_path)
        result_file = os.path.join(rounds_dir, "results.json")
        # Ensure directory exists
        os.makedirs(rounds_dir, exist_ok=True)
        # If file doesn't exist, create a new one with an empty list
        if not os.path.exists(result_file):
            with open(result_file, "w") as file:
                json.dump([], file)
        with open(result_file, "r") as file:
            try:
                data = json.load(file)
            except json.JSONDecodeError:
                # The file is created empty in __init__.
                data = []

        scores_by_round = defaultdict(list)
        for entry in data:
            scores_by_round[entry["round"]].append(entry["score"])

        for round_number, scores in sorted(scores_by_round.items(), key=lambda x: x[0]):
            avg_score = np.mean(scores)
            std = np.std(scores)
            print(f"Round {round_number}: Average Score = {avg_score:.4f}, Standard Deviation = {std:.4f}")
        return data

    def extract(self, response):
        """Decode the JSON payload found between ``[CONTENT]`` and ``[/CONTENT]``.

        Raises:
            json.JSONDecodeError: If the extracted text is not valid JSON.
        """
        TAG = "CONTENT"
        req_key = f"[/{TAG}]"

        def re_extract_content(cont, pattern):
            # Return the first non-empty match, or the input when none exists.
            matches = re.findall(pattern, cont, re.DOTALL)
            for match in matches:
                if match:
                    cont = match
                    break
            return cont.strip()

        raw_content = copy.deepcopy(response)
        pattern = r"\[CONTENT\]([\s\S]*)\[/CONTENT\]"
        new_content = re_extract_content(raw_content, pattern)
        if not new_content.startswith("{"):
            # TODO find a more general pattern
            # for `[CONTENT]xxx[CONTENT]xxxx[/CONTENT] situation
            print(f"extract_content try another pattern: {pattern}")
            if req_key not in new_content:
                raw_content = copy.deepcopy(new_content + "\n" + req_key)
            new_content = re_extract_content(raw_content, pattern)
        else:
            if req_key in new_content:
                idx = new_content.find(req_key)
                new_content = new_content[:idx]
            new_content = new_content.strip()
        return json.JSONDecoder(strict=False).decode(new_content, _w=json.decoder.WHITESPACE.match)

    @staticmethod
    def xml_extract(context: str, field_names: list, field_types) -> Dict[str, Any]:
        """Extract ``<field>value</field>`` contents and convert them by type.

        Args:
            context: Text containing the XML-style tags.
            field_names: Tag names to look for.
            field_types: Mapping from tag name to ``str``, ``int``, ``bool``,
                ``list`` or ``dict``.

        Returns:
            Mapping from tag name to converted value. Tags that do not occur
            are omitted; values that fail conversion become ``0``, ``[]`` or
            ``{}``.
        """
        extracted_data: Dict[str, Any] = {}

        for field_name in field_names:
            # The closing tag is required: without it the lazy group would
            # always match the empty string.
            pattern = rf"<{field_name}>(.*?)</{field_name}>"
            match = re.search(pattern, context, re.DOTALL)
            if not match:
                continue
            raw_value = match.group(1).strip()
            field_type = field_types.get(field_name)

            if field_type == str:
                extracted_data[field_name] = raw_value
            elif field_type == int:
                try:
                    extracted_data[field_name] = int(raw_value)
                except ValueError:
                    extracted_data[field_name] = 0
            elif field_type == bool:
                extracted_data[field_name] = raw_value.lower() in ("true", "yes", "1", "on")
            elif field_type in (list, dict):
                # SECURITY NOTE(review): eval() runs model-generated text as
                # Python code; consider ast.literal_eval if only literals
                # are expected here.
                try:
                    value = eval(raw_value)
                    if not isinstance(value, field_type):
                        raise ValueError
                except Exception:
                    value = field_type()
                extracted_data[field_name] = value

        return extracted_data

    def save_optimized_graph(self):
        """Copy the best-scoring candidate round to ``best_workflow``.

        Raises:
            FileNotFoundError: If the directory of the best round is missing.
        """
        top_rounds = self.get_top_rounds()
        # select_round also returns the candidates sorted by descending score;
        # only that ordering is needed here.
        _, items = self.select_round(top_rounds)
        graph_path = Path(self.results_path)
        self.optimized_round = items[0]["round"]

        source_round = graph_path / f"round_{self.optimized_round}"
        dest_round = graph_path / "best_workflow"
        if source_round.exists():
            shutil.copytree(source_round, dest_round, dirs_exist_ok=True)
        else:
            raise FileNotFoundError(f"The source folder {source_round} does not exist.")
+1,667 @@ +import copy +import importlib +import datetime,os,json,asyncio,re,random +import pandas as pd +import numpy as np +import shutil +import time +import warnings + +from termcolor import colored +from pathlib import Path +from collections import defaultdict +from tqdm.asyncio import tqdm_asyncio +from typing import Dict,Any,Tuple +from pydantic_core import to_jsonable_python + +from .all_prompt import * +from .evaluate import evaluate_math +from ..mas_base import MAS +from ..utils import load_config + +warnings.filterwarnings("ignore", category=SyntaxWarning, message="invalid escape sequence") + +class AFlow_MATH(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + method_config_name = "config" if method_config_name is None else method_config_name + self.method_config = load_config( + Path(__file__).parent / "configs" / f"{method_config_name}.yaml" + ) + self.dataset_name = general_config['test_dataset_name'] + self.model_name_optimize = self.method_config.get('optimize_meta_model_name','claude-3-5-sonnet-20241022') + self.model_name_execute = self.method_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + self.sample = self.method_config['sample'] + self.max_rounds = self.method_config['max_rounds'] + self.validation_rounds = self.method_config['validation_rounds'] + self.earlystop = self.method_config['earlystop'] + self.root_path = str(os.path.relpath(Path(__file__).parent, start=os.getcwd())) + self.results_path = f"results/{self.dataset_name}/aflow/{self.model_name_optimize}/{self.model_name_execute}" + self.top_scores = [] + self.round = 1 + self.graph = None + + self.operators:list = ["Custom", "ScEnsemble", "Programmer"] + self.type = "math" + + results_path = Path(self.results_path) + if not results_path.exists(): + graph_path = Path(self.root_path) / "initial_workflows" / "math" + results_path.mkdir(parents=True, exist_ok=True) + exp_path = os.path.join(self.results_path, 
"processed_experience.json") + res_path = os.path.join(self.results_path, "results.json") + with open(exp_path, 'w') as f: + pass + with open(res_path, 'w') as f: + pass + for item in graph_path.iterdir(): + dest = results_path / item.name + if item.is_dir(): + shutil.copytree(item, dest, dirs_exist_ok=True) + else: + shutil.copy2(item, dest) + + self.optimized_round = 1 + self.inference_flag = True + + + async def inference(self, sample: Dict[str, Any]) -> Dict[str, Any]: + query = sample.get("query") + if not query: + raise ValueError("Sample must contain a 'query' key.") + self.inference_flag = True + optimized_path = Path(self.results_path) / "best_workflow" + if optimized_path.exists(): + graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.best_workflow.graph" + else: + raise NotImplementedError("Best_workflow path does not exist!") + module = importlib.import_module(graph_module_name, package=__package__) + self.graph = getattr(module, "Workflow") + + graph = self.graph(name="Optimized", env=self) + + response = await graph(problem=query) + #print("Raw response: ",response) + return response + + def optimizing(self,val_dataset): + self.inference_flag = False + + optimized_path = Path(self.results_path) / "best_workflow" + if optimized_path.exists(): + print(colored("The optimal graph already exists!\n","red")) + return + + print(colored("Start optimizing ...\n","yellow")) + for i in range(self.max_rounds): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + retry_count = 0 + max_retries = 1 + while retry_count < max_retries: + try: + print(colored(f"{i+1} round of optimization...\n","light_cyan")) + score = loop.run_until_complete(self._optimize_graph(val_dataset)) + break + except Exception as e: + retry_count += 1 + print(f"Optimization failed: {e}") + if retry_count == max_retries: + score = None + wait_time = 5 * retry_count + time.sleep(wait_time) + + if retry_count < max_retries: 
async def _optimize_graph(self, val_dataset):
    """Generate, save and score one new candidate workflow.

    Steps, in order:
      1. Load the score history from ``results.json`` (a missing or corrupt
         file starts an empty history).
      2. When ``self.round == 1``, import and evaluate the seed workflow.
      3. Sample a parent round, build the optimisation prompt and query the
         optimizer model; repeat until ``check_modification`` accepts the
         proposed modification (the loop has no retry limit).
      4. Write ``graph.py``, ``prompt.py`` and ``__init__.py`` for round
         ``self.round + 1``, import the new ``Workflow`` and evaluate it.
      5. Record the outcome in that round's ``experience.json``.

    Args:
        val_dataset: Validation problems, passed through to ``evaluate_graph``.

    Returns:
        The average validation score of the new candidate.

    Raises:
        ValueError: If the sampled parent's ``graph.py`` contains no
            ``class Workflow:`` definition.
    """
    validation_n = self.validation_rounds
    graph_path = self.results_path
    result_path = os.path.join(graph_path, "results.json")

    data = []
    if os.path.exists(result_path):
        with open(result_path, "r") as json_file:
            try:
                data = json.load(json_file)
            except json.JSONDecodeError:
                data = []

    if self.round == 1:
        # The seed workflow has to be scored once so it can be sampled as a parent.
        directory = os.path.join(graph_path, f"round_{self.round}")
        os.makedirs(directory, exist_ok=True)

        graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.round_{self.round}.graph"
        # Round packages are created while the interpreter is running; the
        # import system may otherwise answer from a stale directory cache.
        importlib.invalidate_caches()
        module = importlib.import_module(graph_module_name, package=__package__)
        self.graph = getattr(module, "Workflow")
        await self.evaluate_graph(directory, validation_n, data, val_dataset, True)

    # Nothing below changes between retries, so build it once instead of
    # re-reading operator.json for every operator on every attempt.
    operators_description = ""
    if self.operators:
        operator_path = os.path.join(graph_path, "template/operator.json")
        with open(operator_path, "r") as f:
            operator_data = json.load(f)
        for index, operator in enumerate(self.operators, start=1):
            matched_data = operator_data[operator]
            desc = matched_data["description"]
            interface = matched_data["interface"]
            operators_description += f"{index}. {operator}: {desc}, with interface {interface}).\n"

    names = ["modification", "graph", "prompt"]
    field_types = {"modification": str, "graph": str, "prompt": str}
    # Show each field as a closed element so the reply can be delimited.
    example_str = "\n".join(f"<{name}>content</{name}>" for name in names)
    graph_system = WORKFLOW_OPTIMIZE_PROMPT.format(type=self.type)

    while True:
        directory = os.path.join(graph_path, f"round_{self.round+1}")
        os.makedirs(directory, exist_ok=True)

        # parent <- SelectParent(results)
        top_rounds = self.get_top_rounds()
        sample, _ = self.select_round(top_rounds)

        prompt, graph_load = self.read_graph_files(sample["round"], graph_path)
        workflow_match = re.search(r"class Workflow:.+", graph_load, re.DOTALL)
        if workflow_match is None:
            raise ValueError(
                f"round_{sample['round']}/graph.py does not define 'class Workflow:'"
            )
        parent_graph = workflow_match.group(0)

        # context <- LoadContext(parent, experiences)
        processed_experience = self.load_experience()
        experience = self.format_experience(processed_experience, sample["round"])
        log_data = self.load_log(sample["round"])

        graph_input = WORKFLOW_INPUT.format(
            experience=experience,
            score=sample["score"],
            graph=parent_graph,
            prompt=prompt,
            operator_description=operators_description,
            type=self.type,
            log=log_data,
        )
        graph_optimize_prompt = graph_input + WORKFLOW_CUSTOM_USE + graph_system
        instructions = graph_optimize_prompt + f"\n# Response format (must be strictly followed) (do not include any other formats except for the given XML format):\n{example_str}"
        response = await self.async_call_llm(prompt=instructions, model_name=self.model_name_optimize)
        response = self.xml_extract(response, names, field_types)

        # Accept only a modification not already tried on this parent;
        # otherwise sample again and regenerate.
        if self.check_modification(processed_experience, response["modification"], sample["round"]):
            break

    # Save the graph and evaluate
    graph = WORKFLOW_TEMPLATE.format(graph=response["graph"], round=self.round + 1, dataset=self.dataset_name)

    with open(os.path.join(directory, "graph.py"), "w", encoding="utf-8") as file:
        file.write(graph)

    with open(os.path.join(directory, "prompt.py"), "w", encoding="utf-8") as file:
        file.write(response["prompt"])

    with open(os.path.join(directory, "__init__.py"), "w", encoding="utf-8") as file:
        file.write("")

    experience = {
        "father node": sample["round"],
        "modification": response["modification"],
        "before": sample["score"],
        "after": None,
        "succeed": None,
    }
    graph_module_name = f"results.{self.dataset_name}.aflow.{self.model_name_optimize}.{self.model_name_execute}.round_{self.round+1}.graph"
    importlib.invalidate_caches()
    module = importlib.import_module(graph_module_name, package=__package__)
    self.graph = getattr(module, "Workflow")
    avg_score = await self.evaluate_graph(directory, validation_n, data, val_dataset)

    experience["after"] = avg_score
    experience["succeed"] = bool(avg_score > experience["before"])

    # `directory` already exists: graph.py was written into it above.
    with open(os.path.join(directory, "experience.json"), "w", encoding="utf-8") as fout:
        json.dump(experience, fout, ensure_ascii=False, indent=4, default=to_jsonable_python)
    return avg_score
def check_convergence(self, top_k=3, z=0, consecutive_rounds=5):
    """Decide whether the running top-k average score has stopped moving.

    Reads ``results.json`` under ``self.results_path``, stores the raw records
    on ``self.data`` and the per-round score lists on ``self.rounds``. Rounds
    are then walked in ascending order; at each step the mean of the best
    ``top_k`` round averages seen so far is compared with the previous step.
    A step counts as converged when the absolute change is at most ``z`` times
    the combined standard error.

    Args:
        top_k: Number of best rounds averaged at each step.
        z: Multiplier on the standard error of the change.
        consecutive_rounds: Converged steps in a row needed to report success.

    Returns:
        ``(True, first_index, last_index)`` once enough consecutive steps have
        converged (zero-based positions in the sorted round list), otherwise
        ``(False, None, None)``.
    """
    result_file = os.path.join(self.results_path, "results.json")
    with open(result_file, "r") as file:
        self.data = json.load(file)

    grouped = {}
    for record in self.data:
        grouped.setdefault(record["round"], []).append(record["score"])
    self.rounds = grouped

    ordered = sorted(grouped.items(), key=lambda pair: pair[0])
    avg_scores = [np.mean(scores) for _, scores in ordered]
    stds = [np.std(scores) for _, scores in ordered]

    # Too few rounds to compare top_k + 1 of them: report "not converged".
    if len(avg_scores) < top_k + 1:
        return False, None, None

    streak = 0
    prev_y = None
    prev_sigma = None
    for i in range(len(avg_scores)):
        # Best top_k rounds among rounds 0..i, by descending average score.
        best = np.argsort(avg_scores[: i + 1])[::-1][:top_k]
        y_current = np.mean([avg_scores[j] for j in best])
        sigma_current = np.sqrt(np.sum([stds[j] ** 2 for j in best]) / (top_k**2))

        if prev_y is not None:
            delta_y = y_current - prev_y
            sigma_delta = np.sqrt(sigma_current**2 + prev_sigma**2)
            if abs(delta_y) <= z * sigma_delta:
                streak += 1
                if streak >= consecutive_rounds:
                    return True, i - consecutive_rounds + 1, i
            else:
                # A large move restarts the count.
                streak = 0

        prev_y = y_current
        prev_sigma = sigma_current

    return False, None, None
def select_round(self, items, alpha=0.2, lambda_=0.3):
    """Randomly pick a parent round, favouring higher scores.

    The sampling distribution mixes a uniform distribution (weight
    ``lambda_``) with a softmax over ``alpha * score * 100`` (weight
    ``1 - lambda_``).

    Args:
        items: Dicts that each carry a ``"score"`` entry.
        alpha: Softmax sharpness applied to the percentage scores.
        lambda_: Share of the uniform component in the mixture.

    Returns:
        ``(selected_item, items_sorted_by_score_descending)``.

    Raises:
        ValueError: If ``items`` is empty or the weights cannot be normalised.
    """
    if not items:
        raise ValueError("Item list is empty.")

    ranked = sorted(items, key=lambda entry: entry["score"], reverse=True)
    scores = np.array([entry["score"] * 100 for entry in ranked], dtype=np.float64)
    count = len(scores)
    if count == 0:
        raise ValueError("Score list is empty.")

    uniform_prob = np.full(count, 1.0 / count, dtype=np.float64)

    # Subtract the maximum before exponentiating so the largest weight is exp(0).
    exp_weights = np.exp(alpha * (scores - np.max(scores)))
    weight_total = np.sum(exp_weights)
    if weight_total == 0:
        raise ValueError("Sum of exponential weights is 0, cannot normalize.")
    score_prob = exp_weights / weight_total

    mixed_prob = lambda_ * uniform_prob + (1 - lambda_) * score_prob
    prob_total = np.sum(mixed_prob)
    if not np.isclose(prob_total, 1.0):
        mixed_prob = mixed_prob / prob_total

    print(f"\nMixed probability distribution: {mixed_prob}")
    print(f"\nSorted rounds: {ranked}")

    chosen = np.random.choice(len(ranked), p=mixed_prob)
    print(f"\nSelected index: {chosen}, Selected item: {ranked[chosen]}")

    return ranked[chosen], ranked
def load_experience(self):
    """Collect every round's ``experience.json`` into a per-parent summary.

    Scans ``self.results_path`` for ``round_*`` directories and, for each one
    that has an ``experience.json``, files its modification under the parent
    round ("father node") as either a success or a failure. The summary is
    also written to ``processed_experience.json`` in the same directory.

    A round whose file is unreadable or incomplete is reported on stdout and
    skipped; it never leaves a half-filled entry behind.

    Returns:
        ``{parent_round: {"score": before, "success": {round: {...}},
        "failure": {round: {...}}}}`` where each inner record holds
        ``"modification"`` and ``"score"``.
    """
    rounds_dir = os.path.normpath(self.results_path)
    experience_data = {}

    # os.listdir order is arbitrary. Sorting by (length, name) visits
    # round_2 before round_10, so the summary is reproducible between runs.
    round_dirs = sorted(os.listdir(rounds_dir), key=lambda name: (len(name), name))

    for round_dir in round_dirs:
        round_path = os.path.join(rounds_dir, round_dir)
        if not (os.path.isdir(round_path) and round_dir.startswith("round_")):
            continue
        try:
            round_number = int(round_dir.split("_")[1])
            json_file_path = os.path.join(round_path, "experience.json")
            if not os.path.exists(json_file_path):
                continue
            with open(json_file_path, "r", encoding="utf-8") as fin:
                try:
                    data = json.load(fin)
                except Exception as err:
                    raise ValueError(f"read json file: {json_file_path} failed") from err

            # Read every required key first so a malformed file raises
            # before experience_data is touched.
            father_node = data["father node"]
            before = data["before"]
            outcome = "success" if data["succeed"] else "failure"
            record = {"modification": data["modification"], "score": data["after"]}

            parent = experience_data.setdefault(
                father_node, {"score": None, "success": {}, "failure": {}}
            )
            if parent["score"] is None:
                parent["score"] = before
            parent[outcome][round_number] = record
        except Exception as e:
            # Best effort: one bad round must not block the optimisation loop.
            print(f"Error processing {round_dir}: {str(e)}")

    output_path = os.path.join(rounds_dir, "processed_experience.json")
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(experience_data, outfile, indent=4, ensure_ascii=False)

    print(f"Processed experience data saved to {output_path}")
    return experience_data
def load_log(self, cur_round):
    """Return up to three randomly chosen failure-log entries for a round.

    The entries come from ``round_<cur_round>/log.json`` under
    ``self.results_path`` and are rendered as indented JSON, each followed by
    a blank line, for inclusion in the optimisation prompt.

    Args:
        cur_round: Round number whose log should be sampled.

    Returns:
        The formatted sample, or ``""`` when the log file is missing, empty,
        or holds an empty list.

    Raises:
        ValueError: If the file has content that is not valid JSON.
    """
    log_dir = os.path.join(self.results_path, f"round_{cur_round}/log.json")
    if not os.path.exists(log_dir):
        return ""
    print(log_dir)

    with open(log_dir, "r", encoding="utf-8") as fin:
        raw = fin.read()

    # The seed round ships a zero-byte log.json, and a round with no mismatch
    # never writes to it. That means "nothing logged", not a corrupt file.
    if not raw.strip():
        return ""

    try:
        data = json.loads(raw)
    except Exception as err:
        raise ValueError(f"read json file: {log_dir} failed") from err

    # A single object (or any other non-list value) is treated as one entry.
    if not isinstance(data, list):
        data = [data]
    if not data:
        return ""

    picked = random.sample(data, min(3, len(data)))
    return "".join(
        json.dumps(entry, indent=4, ensure_ascii=False) + "\n\n" for entry in picked
    )
def print_results(self):
    """Print the average score and standard deviation of every recorded round.

    Reads ``results.json`` under ``self.results_path``, creating the directory
    and an empty result file when they do not exist yet. Scores are grouped by
    round number and one summary line per round is printed in ascending round
    order.

    Returns:
        The list of raw result records loaded from ``results.json``.
    """
    rounds_dir = os.path.normpath(self.results_path)
    result_file = os.path.join(rounds_dir, "results.json")

    os.makedirs(rounds_dir, exist_ok=True)
    if not os.path.exists(result_file):
        with open(result_file, "w") as file:
            json.dump([], file)

    with open(result_file, "r") as file:
        data = json.load(file)

    scores_by_round = {}
    for entry in data:
        scores_by_round.setdefault(entry["round"], []).append(entry["score"])

    # Label each line with the real round number rather than its position, so
    # gaps in the history cannot mislabel a round.
    for round_number, scores in sorted(scores_by_round.items(), key=lambda item: item[0]):
        avg_score = np.mean(scores)
        std = np.std(scores)
        print(f"Round {round_number}: Average Score = {avg_score:.4f}, Standard Deviation = {std:.4f}")

    return data
@staticmethod
def xml_extract(context: str, field_names: list, field_types) -> Dict[str, Any]:
    """Pull ``<field>value</field>`` elements out of a model reply and convert them.

    Args:
        context: Raw text returned by the model.
        field_names: Tag names to look for; only the first occurrence of each
            is used.
        field_types: Mapping from tag name to the target type (``str``,
            ``int``, ``bool``, ``list`` or ``dict``).

    Returns:
        A dict with one entry per tag that was found and has a supported type.
        Tags that are absent, unterminated, or mapped to an unsupported type
        are left out. Values that cannot be converted fall back to ``0``,
        ``[]`` or ``{}``.
    """
    # Function-scope import: this block is the only user of `ast`.
    import ast

    def parse_literal(raw_value, expected_type):
        # `context` is untrusted model output, so it is parsed as a Python
        # literal and never evaluated as code.
        try:
            parsed = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return expected_type()
        return parsed if isinstance(parsed, expected_type) else expected_type()

    extracted_data: Dict[str, Any] = {}

    for field_name in field_names:
        tag = re.escape(field_name)
        # The closing tag is required: without it the lazy group would always
        # capture an empty string.
        match = re.search(rf"<{tag}>(.*?)</{tag}>", context, re.DOTALL)
        if not match:
            continue

        raw_value = match.group(1).strip()
        field_type = field_types.get(field_name)

        if field_type is str:
            extracted_data[field_name] = raw_value
        elif field_type is int:
            try:
                extracted_data[field_name] = int(raw_value)
            except ValueError:
                extracted_data[field_name] = 0
        elif field_type is bool:
            extracted_data[field_name] = raw_value.lower() in ("true", "yes", "1", "on")
        elif field_type is list or field_type is dict:
            extracted_data[field_name] = parse_literal(raw_value, field_type)

    return extracted_data
def save_optimized_graph(self):
    """Copy the best-scoring round's folder to ``best_workflow``.

    The best round is the highest-scoring entry returned by
    ``self.get_top_rounds()``; on a tie the earliest entry wins. Its number is
    stored on ``self.optimized_round`` and its directory is copied, merging
    into any existing ``best_workflow`` directory under ``self.results_path``.

    Raises:
        ValueError: If there are no scored rounds.
        FileNotFoundError: If the best round's directory does not exist.
    """
    top_rounds = self.get_top_rounds()
    if not top_rounds:
        raise ValueError("Item list is empty.")

    # Only the top-ranked entry is needed, so take the maximum directly. The
    # random parent sampler would draw from the RNG and print debug output.
    best = max(top_rounds, key=lambda item: item["score"])
    self.optimized_round = best["round"]

    graph_path = Path(self.results_path)
    source_round = graph_path / f"round_{self.optimized_round}"
    dest_round = graph_path / "best_workflow"
    if not source_round.exists():
        raise FileNotFoundError(f"The source folder {source_round} does not exist.")
    shutil.copytree(source_round, dest_round, dirs_exist_ok=True)
+The prompt you need to generate is only the one used in `prompt_custom.XXX` within Custom. Other methods already have built-in prompts and are prohibited from being generated. Only generate those needed for use in `prompt_custom`; please remove any unused prompts in prompt_custom. +the generated prompt must not contain any placeholders. +Considering information loss, complex graphs may yield better results, but insufficient information transmission can omit the solution. It's crucial to include necessary context during the process.""" + + +WORKFLOW_INPUT = """ +Here is a graph and the corresponding prompt (prompt only related to the custom method) that performed excellently in a previous iteration (maximum score is 1). You must make further optimizations and improvements based on this graph. The modified graph must differ from the provided example, and the specific differences should be noted within the xxx section.\n + + {experience} + (such as:add /delete /modify/ ...) + {score} + {graph} + {prompt}(only prompt_custom) + {operator_description} + +Below are the logs of some results with the aforementioned Graph that performed well but encountered errors, which can be used as references for optimization: +{log} + +First, provide optimization ideas. **Only one detail point can be modified at a time**, and no more than 5 lines of code may be changed per modification—extensive modifications are strictly prohibited to maintain project focus! +When introducing new functionalities in the graph, please make sure to import the necessary libraries or modules yourself, except for operator, prompt_custom, create_llm_instance, and CostManage, which have already been automatically imported. +**Under no circumstances should Graph output None for any field.** +Use custom methods to restrict your output format, rather than using code (outside of the code, the system will extract answers based on certain rules and score them). 
+It is very important to format the Graph output answers, you can refer to the standard answer format in the log. +""" + +WORKFLOW_CUSTOM_USE = """Here's an example of using the `custom` method in graph: +``` +# You can write your own prompt in prompt_custom and then use it in the Custom method in the graph +response = await self.custom(input=problem, instruction=prompt_custom.XXX_PROMPT) +# You can also concatenate previously generated string results in the input to provide more comprehensive contextual information. +# response = await self.custom(input=problem+f"xxx:{xxx}, xxx:{xxx}", instruction=prompt_custom.XXX_PROMPT) +# The output from the Custom method can be placed anywhere you need it, as shown in the example below +solution = await self.generate(problem=f"question:{problem}, xxx:{response['response']}") +``` +Note: In custom, the input and instruction are directly concatenated(instruction+input), and placeholders are not supported. Please ensure to add comments and handle the concatenation externally.\n + +**Introducing multiple operators at appropriate points can enhance performance. 
def write_json_file(json_file: str, data: list, encoding: str = None, indent: int = 4):
    """Serialise ``data`` as JSON to ``json_file``, creating parent folders first.

    Args:
        json_file: Destination path.
        data: Object to serialise; values ``json`` cannot encode natively are
            passed through ``to_jsonable_python``.
        encoding: Text encoding handed to ``open``.
        indent: Indentation width of the written JSON.
    """
    parent_dir = Path(json_file).parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True, exist_ok=True)

    dump_options = dict(ensure_ascii=False, indent=indent, default=to_jsonable_python)
    with open(json_file, "w", encoding=encoding) as handle:
        json.dump(data, handle, **dump_options)
def math_equal(prediction: Any, reference: Any) -> bool:
    """Return True when two answers are textually, numerically or symbolically equal.

    Checks are tried in order: identical string form, numeric closeness
    (absolute tolerance 1e-3, with percentages converted to fractions), then
    ``symbolic_equal``. A failing symbolic comparison counts as "not equal".
    """
    if str(prediction) == str(reference):
        return True

    predicted_number = parse_digits(prediction)
    reference_number = parse_digits(reference)
    if predicted_number is not None and reference_number is not None:
        return isclose(predicted_number, reference_number, abs_tol=1e-3)

    try:
        return symbolic_equal(prediction, reference)
    except Exception:
        # Parsing arbitrary model output can fail in many ways; any failure
        # means the answers could not be shown equal.
        return False


def is_digit(num):
    """Return True when ``parse_digits`` can turn ``num`` into a number."""
    return parse_digits(num) is not None


def parse_digits(num):
    """Convert ``num`` to ``float``, or return ``None`` when it is not numeric.

    Thousands separators are removed. A trailing ``%`` (plain or LaTeX-escaped
    as ``\\%``) marks a percentage, which is returned as a fraction, so
    ``"50%"`` and ``"50\\%"`` both give ``0.5``.
    """
    text = str(num).replace(",", "")
    try:
        return float(text)
    except ValueError:
        pass

    # Scale by 1/100 only when a percent sign was actually present; a bare
    # trailing backslash is not a percentage.
    if text.endswith("%"):
        text = text[:-1]
        # LaTeX writes the sign as "\%", which leaves a backslash behind.
        if text.endswith("\\"):
            text = text[:-1]
        try:
            return float(text) / 100
        except ValueError:
            pass
    return None
def check_solution(solution, test, entry_point):
    """Run a generated solution against its test code and report PASS or FAIL.

    The solution is sanitised, executed into a fresh namespace together with
    the test code, and the test's ``check`` function is then run with a
    15-second limit.

    NOTE(review): ``solution`` and ``test`` are executed with ``exec`` in this
    process. This is not a sandbox; only use it on inputs you are prepared to
    run.

    Args:
        solution: Source code produced by the workflow.
        test: Source code that defines ``check``.
        entry_point: Name of the function the solution must define.

    Returns:
        ``("PASS", message)`` or ``("FAIL", message)``. Failures other than a
        timeout are also appended to ``error.log`` in the working directory.
    """
    solution = sanitize(code=solution, entrypoint=entry_point)
    try:
        global_dict = {
            "math": __import__("math"),
            "hashlib": __import__("hashlib"),
            "re": __import__("re"),
            "List": List,
            "Dict": Dict,
            "Tuple": Tuple,
            "Optional": Optional,
            "Any": Any,
        }

        exec(solution, global_dict)

        if entry_point not in global_dict:
            raise ValueError(f"Function {entry_point} is not defined in the solution.")

        exec(test, global_dict)

        check = global_dict["check"]

        result = run_with_timeout(check, 15)

        if result is None:
            result = ("PASS", "The solution passed all test cases.")

    except TimeoutError:
        # Must come before the general handler, or every failure would be
        # reported as a timeout.
        result = (
            "FAIL",
            "Execution timed out. Please check if your solution contains infinite loops or overly time-consuming operations.",
        )
    except Exception as e:
        error_message = f"Error: {str(e)}.\n Solution: {solution}.\n Test: {test}"
        result = ("FAIL", error_message)

        with open("error.log", "a", encoding="utf-8") as log_file:
            log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n")

    return result


def run_with_timeout(func, timeout):
    """Call ``func()`` in a worker thread and wait at most ``timeout`` seconds.

    Args:
        func: Zero-argument callable.
        timeout: Maximum wait in seconds.

    Returns:
        Whatever ``func`` returns.

    Raises:
        TimeoutError: If ``func`` has not finished in time. The worker cannot
            be stopped, so it is left running as a daemon thread.
        RuntimeError: If ``func`` ended with a non-``Exception`` error such as
            ``SystemExit``.
        Exception: Any exception raised (or returned) by ``func``.
    """
    outcome = []
    finished = threading.Event()

    def target():
        try:
            outcome.append(func())
        except BaseException as exc:
            # SystemExit and similar would otherwise end the thread silently
            # and leave an empty result that looks like success.
            outcome.append(exc)
        finally:
            finished.set()

    # Daemon: a hung function must not stop the interpreter from exiting.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()

    if not finished.wait(timeout):
        raise TimeoutError("Function execution timed out")

    if not outcome:
        return None
    value = outcome[0]
    if isinstance(value, Exception):
        raise value
    if isinstance(value, BaseException):
        raise RuntimeError(f"Function terminated with {type(value).__name__}") from value
    return value
class Workflow:
    """Seed (round 1) workflow: a single ``Custom`` operator call with no instruction."""

    def __init__(self, name: str, env) -> None:
        # `env` is used as the LLM handle given to the operator.
        self.name = name
        self.llm = env
        self.custom = operator.Custom(self.llm)

    async def __call__(self, problem: str):
        """Send ``problem`` to the Custom operator and return its ``"response"`` entry."""
        reply = await self.custom(input=problem, instruction="")
        return reply["response"]
+ +""" + diff --git a/methods/aflow/initial_workflows/math/template/__init__.py b/methods/aflow/initial_workflows/math/template/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/methods/aflow/initial_workflows/math/template/op_prompt.py b/methods/aflow/initial_workflows/math/template/op_prompt.py new file mode 100644 index 0000000..af13cc7 --- /dev/null +++ b/methods/aflow/initial_workflows/math/template/op_prompt.py @@ -0,0 +1,29 @@ +SC_ENSEMBLE_PROMPT = """ +Given the question described as follows: {problem} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field. +""" + +PYTHON_CODE_VERIFIER_PROMPT = """ +You are a professional Python programmer. Your task is to write complete, self-contained code based on a given mathematical problem and output the answer. The code should include all necessary imports and dependencies, and be ready to run without additional setup or environment configuration. + +Problem description: {problem} +Other analysis: {analysis} +{feedback} + +Your code should: +1. Implement the calculation steps described in the problem. +2. Define a function named `solve` that performs the calculation and returns the result. The `solve` function should not require any input parameters; instead, it should obtain all necessary inputs from within the function or from globally defined variables. +3. `solve` function return the final calculation result. 
def run_code(code):
    """Execute generated Python source and return the result of its ``solve()``.

    The source is first screened for imports of prohibited modules, then run
    with ``exec`` in a fresh global namespace. If that namespace ends up with a
    callable named ``solve``, it is called with no arguments.

    Args:
        code: Python source text to execute.

    Returns:
        A ``(status, message)`` tuple. ``status`` is ``"Success"`` (``message``
        is ``str()`` of the value returned by ``solve``) or ``"Error"``
        (``message`` names the prohibited import, reports the missing
        ``solve`` function, or carries the exception text and traceback).
    """
    disallowed_imports = (
        "os", "sys", "subprocess", "multiprocessing",
        "matplotlib", "seaborn", "plotly", "bokeh", "ggplot",
        "pylab", "tkinter", "PyQt5", "wx", "pyglet",
    )

    try:
        # Match the module name as a whole word. A plain substring test also
        # rejects harmless modules that merely share a prefix with a
        # prohibited one (e.g. ``import sysconfig``); ``import os.path`` and
        # ``from os import path`` are still caught.
        for lib in disallowed_imports:
            if re.search(rf"\b(?:import|from)\s+{re.escape(lib)}\b", code):
                return "Error", f"Prohibited import: {lib} and graphing functionalities"

        # NOTE: exec runs arbitrary code; the import screen above is a
        # best-effort filter, not a sandbox.
        global_namespace = {}
        exec(code, global_namespace)

        # The generated code is expected to define a function named 'solve'.
        solve = global_namespace.get("solve")
        if not callable(solve):
            return "Error", "Function 'solve' not found"
        return "Success", str(solve())
    except Exception as e:
        return "Error", f"Execution error: {str(e)}\n{traceback.format_exc()}"
" + ) + prompt = prompt + code_instructions + try: + if self.llm.inference_flag: + response = await self.llm.call_llm(prompt=prompt) + else: + response = await self.llm.call_llm(prompt=prompt,model_name=self.llm.model_name_execute) + + code = self._extract_code_from_markdown(response) + + # If no code blocks found, treat the entire response as code + if not code: + code = response + + # Use the sanitize function to extract valid code and handle dependencies + sanitized_code = sanitize(code=code, entrypoint=None) + + # If sanitize returned empty string, the code is invalid + if not sanitized_code.strip(): + response = None + + # Return the sanitized code + response = {"code": sanitized_code} + + + if not isinstance(response, dict): + response = {"code": response} + except Exception as e: + response = {"error": str(e)} + + return response + + def _extract_code_from_markdown(self, text: str) -> str: + """ + Extract code from markdown code blocks in the response. + + Args: + text: The text containing possible markdown code blocks + + Returns: + The extracted code as a string, or empty string if no code blocks found + """ + # Look for Python code blocks (```python ... ```) + python_pattern = r"```python\s*([\s\S]*?)\s*```" + python_matches = re.findall(python_pattern, text) + + if python_matches: + # Join all Python code blocks + return "\n\n".join(python_matches) + + # If no Python blocks found, look for generic code blocks (``` ... ```) + generic_pattern = r"```\s*([\s\S]*?)\s*```" + generic_matches = re.findall(generic_pattern, text) + + if generic_matches: + # Join all generic code blocks + return "\n\n".join(generic_matches) + + # No code blocks found + return "" + + @retry(stop=stop_after_attempt(3), wait=wait_fixed(2)) + async def __call__(self, problem: str, analysis: str = "None"): + """ + Call method, generate code and execute, retry up to 3 times. 
class ScEnsemble(Operator):
    """
    Ask the LLM which of several candidate solutions is the most consistent one.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, llm, name: str = "ScEnsemble"):
        super().__init__(llm, name)

    async def __call__(self, solutions: List[str], problem: str):
        """Return the candidate the LLM selects.

        Args:
            solutions: Candidate solutions; they are labelled A, B, C, ... in
                list order when shown to the LLM.
            problem: The question the candidates answer.

        Returns:
            ``{"response": <selected solution>}``.

        Raises:
            KeyError: If the reply has no ``<solution_letter>`` element naming
                one of the candidate letters.
        """
        answer_mapping = {}
        labelled_solutions = []
        for index, solution in enumerate(solutions):
            letter = chr(65 + index)
            answer_mapping[letter] = index
            labelled_solutions.append(f"{letter}: \n{str(solution)}\n\n\n")
        solution_text = "".join(labelled_solutions)

        prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)
        field_names = ["solution_letter"]
        # Show the complete element (opening AND closing tag): the parser
        # below only accepts properly closed elements.
        example_str = "\n".join(
            f"<{field_name}>The letter of most consistent solution.</{field_name}>"
            for field_name in field_names
        )
        prompt = prompt + f"\n# Response format (must be strictly followed) (do not include any other formats except for the given XML format):\n{example_str}"

        # NOTE(review): call_llm is awaited here, so it must return an
        # awaitable -- confirm against the llm object passed in.
        if self.llm.inference_flag:
            response = await self.llm.call_llm(prompt=prompt)
        else:
            response = await self.llm.call_llm(prompt=prompt, model_name=self.llm.model_name_execute)

        # Collect <tag>value</tag> pairs. The closing-tag backreference is
        # required: without it the lazy group always captures "".
        found_fields = {}
        if isinstance(response, str):
            for tag, value in re.findall(r"<(\w+)>(.*?)</\1>", response, re.DOTALL):
                found_fields[tag] = value.strip()

        answer = found_fields.get("solution_letter", "").strip().upper()
        if answer not in answer_mapping:
            raise KeyError(
                f"LLM answer {answer!r} is not one of the candidate letters {sorted(answer_mapping)}"
            )
        return {"response": solutions[answer_mapping[answer]]}
def run_code(code):
    """Execute generated Python source and return the result of its ``solve()``.

    The source is first screened for imports of prohibited modules, then run
    with ``exec`` in a fresh global namespace. If that namespace ends up with a
    callable named ``solve``, it is called with no arguments.

    Args:
        code: Python source text to execute.

    Returns:
        A ``(status, message)`` tuple. ``status`` is ``"Success"`` (``message``
        is ``str()`` of the value returned by ``solve``) or ``"Error"``
        (``message`` names the prohibited import, reports the missing
        ``solve`` function, or carries the exception text and traceback).
    """
    disallowed_imports = (
        "os", "sys", "subprocess", "multiprocessing",
        "matplotlib", "seaborn", "plotly", "bokeh", "ggplot",
        "pylab", "tkinter", "PyQt5", "wx", "pyglet",
    )

    try:
        # Match the module name as a whole word. A plain substring test also
        # rejects harmless modules that merely share a prefix with a
        # prohibited one (e.g. ``import sysconfig``); ``import os.path`` and
        # ``from os import path`` are still caught.
        for lib in disallowed_imports:
            if re.search(rf"\b(?:import|from)\s+{re.escape(lib)}\b", code):
                return "Error", f"Prohibited import: {lib} and graphing functionalities"

        # NOTE: exec runs arbitrary code; the import screen above is a
        # best-effort filter, not a sandbox.
        global_namespace = {}
        exec(code, global_namespace)

        # The generated code is expected to define a function named 'solve'.
        solve = global_namespace.get("solve")
        if not callable(solve):
            return "Error", "Function 'solve' not found"
        return "Success", str(solve())
    except Exception as e:
        return "Error", f"Execution error: {str(e)}\n{traceback.format_exc()}"
+ """ + prompt = PYTHON_CODE_VERIFIER_PROMPT.format( + problem=problem, + analysis=analysis, + feedback=feedback + ) + try: + if self.llm.inference_flag: + response = await self.llm.async_call_llm(prompt=prompt) + else: + response = await self.llm.async_call_llm(prompt=prompt,model_name=self.llm.model_name_execute) + + code_pattern = r"\s*(.*?)\s*" + match = re.search(code_pattern, response, re.DOTALL) + if match: + code = match.group(1).strip() + code = re.sub(r"^```(?:\w+)?\n?|```$", "", code, flags=re.MULTILINE).strip() + if not code: + code = "" + + # If no code blocks found, treat the entire response as code + #if not code: + #code = response + + # Use the sanitize function to extract valid code and handle dependencies + #sanitized_code = sanitize(code=code, entrypoint=None) + + # If sanitize returned empty string, the code is invalid + #if not sanitized_code.strip(): + #response = None + + # Return the sanitized code + response = {"code": code} + + except Exception as e: + response = {"error": str(e)} + + return response + + def _extract_code_from_markdown(self, text: str) -> str: + """ + Extract code from markdown code blocks in the response. + + Args: + text: The text containing possible markdown code blocks + + Returns: + The extracted code as a string, or empty string if no code blocks found + """ + # Look for Python code blocks (```python ... ```) + python_pattern = r"```python\s*([\s\S]*?)\s*```" + python_matches = re.findall(python_pattern, text) + + if python_matches: + # Join all Python code blocks + return "\n\n".join(python_matches) + + # If no Python blocks found, look for generic code blocks (``` ... 
class ScEnsemble(Operator):
    """
    Ask the LLM which of several candidate solutions is the most consistent one.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, llm, name: str = "ScEnsemble"):
        super().__init__(llm, name)

    async def __call__(self, solutions: List[str], problem: str):
        """Return the candidate the LLM selects.

        Args:
            solutions: Candidate solutions; they are labelled A, B, C, ... in
                list order when shown to the LLM.
            problem: The question the candidates answer.

        Returns:
            ``{"response": <selected solution>}``.

        Raises:
            KeyError: If the reply has no ``<solution_letter>`` element naming
                one of the candidate letters.
        """
        answer_mapping = {}
        labelled_solutions = []
        for index, solution in enumerate(solutions):
            letter = chr(65 + index)
            answer_mapping[letter] = index
            labelled_solutions.append(f"{letter}: \n{str(solution)}\n\n\n")
        solution_text = "".join(labelled_solutions)

        prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)

        # Name the exact element the parser below looks for; without it the
        # model has no way to know the expected tag.
        prompt = prompt + "\n# Response format (must be strictly followed) (do not include any other formats except for the given XML format):\n<solution_letter>The letter of most consistent solution.</solution_letter>"

        if self.llm.inference_flag:
            response = await self.llm.async_call_llm(prompt=prompt)
        else:
            response = await self.llm.async_call_llm(prompt=prompt, model_name=self.llm.model_name_execute)

        # Collect <tag>value</tag> pairs. The closing-tag backreference is
        # required: without it the lazy group always captures "".
        found_fields = {}
        if isinstance(response, str):
            for tag, value in re.findall(r"<(\w+)>(.*?)</\1>", response, re.DOTALL):
                found_fields[tag] = value.strip()

        answer = found_fields.get("solution_letter", "").strip().upper()
        if answer not in answer_mapping:
            raise KeyError(
                f"LLM answer {answer!r} is not one of the candidate letters {sorted(answer_mapping)}"
            )
        return {"response": solutions[answer_mapping[answer]]}
def get_function_dependency(entrypoint: str, call_graph: Dict[str, Set[str]]) -> Set[str]:
    """Return every name reachable from ``entrypoint`` in ``call_graph``.

    Performs a breadth-first walk over the graph.

    :param entrypoint: Name to start from; it is always part of the result.
    :param call_graph: Maps a definition name to the names it references.
        Names that are not keys of the mapping are treated as leaves.
    :return: The set of reachable names, including ``entrypoint``.
    """
    from collections import deque  # local import: only this function needs it

    visited = {entrypoint}
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole list.
    queue = deque([entrypoint])
    while queue:
        current = queue.popleft()
        for neighbour in call_graph.get(current, ()):
            if neighbour not in visited:
                visited.add(neighbour)
                queue.append(neighbour)
    return visited
class Workflow:
    """Round-1 baseline workflow for code generation problems."""

    def __init__(self, name: str, env) -> None:
        # ``env`` is stored as the LLM handle and shared by both operators.
        self.name = name
        self.llm = env
        self.custom = operator.Custom(self.llm)
        self.custom_code_generate = operator.CustomCodeGenerate(self.llm)

    async def __call__(self, problem: str, entry_point: str):
        """
        Implementation of the workflow.

        The custom operator can generate anything you want, but when you want
        to get standard code you should use the custom_code_generate operator.
        """
        # The general-purpose operator is available as:
        #   await self.custom(input=..., instruction="")
        # Standard code should come from custom_code_generate instead.
        result = await self.custom_code_generate(
            problem=problem,
            entry_point=entry_point,
            instruction="",
        )
        return result["response"]
+""" + + +REFLECTION_ON_PUBLIC_TEST_PROMPT = """ +Given a code problem and a python code solution which failed to pass test or execute, you need to analyze the reason for the failure and propose a better code solution.: +### problem +{problem} + +### Code Solution +{solution} + +### Execution Result +{exec_pass} + +#### Failed Test Case +{test_fail} + +Please provide a reflection on the failed test cases and code solution, followed by a better code solution without any additional text or test cases. +""" \ No newline at end of file diff --git a/methods/aflow/initial_workflows/mbpp/template/operator.json b/methods/aflow/initial_workflows/mbpp/template/operator.json new file mode 100644 index 0000000..5c3234e --- /dev/null +++ b/methods/aflow/initial_workflows/mbpp/template/operator.json @@ -0,0 +1,18 @@ +{ + "Custom": { + "description": "Generates anything based on customized input and instruction.", + "interface": "custom(input: str, instruction: str) -> dict with key 'response' of type str" + }, + "CustomCodeGenerate": { + "description": "Generates code based on customized input and instruction.", + "interface": "custom_code_generate(problem: str, entry_point: str, instruction: str) -> dict with key 'response' of type str" + }, + "ScEnsemble": { + "description": "Uses self-consistency to select the solution that appears most frequently in the solution list, improve the selection to enhance the choice of the best solution.", + "interface": "sc_ensemble(solutions: List[str], problem: str) -> dict with key 'response' of type str" + }, + "Test": { + "description": "Tests the solution using public test cases. If the solution fails, it reflects on the errors and attempts to modify the solution. Returns True and the solution if all tests pass after modifications. 
class ScEnsemble(Operator):
    """
    Ask the LLM which of several candidate solutions is the most consistent one.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, llm, name: str = "ScEnsemble"):
        super().__init__(llm, name)

    async def __call__(self, solutions: List[str], problem: str):
        """Return the candidate the LLM selects.

        Args:
            solutions: Candidate solutions; they are labelled A, B, C, ... in
                list order when shown to the LLM.
            problem: The question the candidates answer.

        Returns:
            ``{"response": <selected solution>}``.

        Raises:
            KeyError: If the extracted answer is not one of the candidate
                letters.
        """
        answer_mapping = {}
        labelled_solutions = []
        for index, solution in enumerate(solutions):
            letter = chr(65 + index)
            answer_mapping[letter] = index
            labelled_solutions.append(f"{letter}: \n{str(solution)}\n\n\n")
        solution_text = "".join(labelled_solutions)

        prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)
        field_names = ["thought", "solution_letter"]
        types = {"thought": str, "solution_letter": str}
        # The example shows complete elements so it agrees with the rule in
        # the prompt that every opening tag needs a closing tag.
        example_str = "\n".join(
            f"<{field_name}>content</{field_name}>" for field_name in field_names
        )
        prompt += f"""
### Response format (must be strictly followed): All content must be enclosed in the given XML tags, ensuring each opening <tag> has a corresponding closing </tag>, with no incomplete or self-closing tags allowed.\n
{example_str}
"""
        if self.llm.inference_flag:
            response = self.llm.call_llm(prompt=prompt)
        else:
            response = self.llm.call_llm(prompt=prompt, model_name=self.llm.model_name_execute)
        extracted = self.llm.xml_extract(response, field_names, types)

        # NOTE(review): the return shape of xml_extract is not visible here.
        # A plain string is used as the answer directly (the previous
        # behaviour); a mapping is assumed to be keyed by field name -- confirm.
        if isinstance(extracted, dict):
            extracted = extracted.get("solution_letter", "")
        answer = str(extracted if extracted is not None else "").strip().upper()

        if answer not in answer_mapping:
            raise KeyError(
                f"LLM answer {answer!r} is not one of the candidate letters {sorted(answer_mapping)}"
            )
        return {"response": solutions[answer_mapping[answer]]}
"error_type": "AssertionError", + "error_message": str(e), + "traceback": tb_str, + } + } + fail_cases.append(error_infomation) + except Exception as e: + with open("tester.txt", "a") as f: + f.write(entry_point + " " + str(e) + "\n") + return {"exec_fail_case": str(e)} + if fail_cases != []: + return fail_cases + else: + return "no error" + + async def __call__( + self, problem, solution, entry_point, test_loop: int = 3 + ): + """ + "Test": { + "description": "Test the solution with test cases, if the solution is correct, return 'no error', if the solution is incorrect, return reflect on the soluion and the error information", + "interface": "test(problem: str, solution: str, entry_point: str) -> str" + } + """ + for _ in range(test_loop): + result = self.exec_code(solution, entry_point) + if result == "no error": + return {"result": True, "solution": solution} + elif "exec_fail_case" in result: + result = result["exec_fail_case"] + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem=problem, + solution=solution, + exec_pass=f"executed unsuccessfully, error: \n {result}", + test_fail="executed unsucessfully", + ) + if self.llm.inference_flag: + response = self.llm.call_llm(prompt=prompt) + else: + response = self.llm.call_llm(prompt=prompt,model_name=self.llm.model_name_execute) + solution = sanitize(code=response, entrypoint=entry_point) + + else: + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem=problem, + solution=solution, + exec_pass="executed successfully", + test_fail=result, + ) + if self.llm.inference_flag: + response = self.llm.call_llm(prompt=prompt) + else: + response = self.llm.call_llm(prompt=prompt,model_name=self.llm.model_name_execute) + solution = sanitize(code=response, entrypoint=entry_point) + + result = self.exec_code(solution, entry_point) + if result == "no error": + return {"result": True, "solution": solution} + else: + return {"result": False, "solution": solution} + +def extract_test_cases_from_jsonl(entry_point: 
str): + file_path = "/MAS-LLM/datasets/data/aflow_mbpp_test.json" + hardcoded_cases = { + "remove_odd": "", + "replace_spaces": "", + "snake_to_camel": "", + "Split": "", + "swap_List": "", + "square_Sum": "", + "sort_sublists": "", + "unique_sublists": "", + } + # Check if there are hardcoded test cases + if entry_point in hardcoded_cases: + return hardcoded_cases[entry_point] + + # If there are no hardcoded test cases, read from the file + with open(file_path, "r") as file: + data = json.load(file) + for item in data: + if item.get("entry_point") == entry_point: + return item.get("test") + + return None + +def test_case_2_test_function(solution: str, test_case: str, entry_point: str): + tester_function = f""" +{solution} + + +def check(candidate): + {test_case} + +def test_check(): + check({entry_point}) + +test_check() +""" + return tester_function diff --git a/methods/aflow/initial_workflows/mbpp/template/sanitize.py b/methods/aflow/initial_workflows/mbpp/template/sanitize.py new file mode 100644 index 0000000..811771d --- /dev/null +++ b/methods/aflow/initial_workflows/mbpp/template/sanitize.py @@ -0,0 +1,177 @@ + +import ast +import traceback +from enum import Enum +from typing import Dict, Generator, List, Optional, Set, Tuple + +import tree_sitter_python +from tree_sitter import Language, Node, Parser + + +class NodeType(Enum): + CLASS = "class_definition" + FUNCTION = "function_definition" + IMPORT = ["import_statement", "import_from_statement"] + IDENTIFIER = "identifier" + ATTRIBUTE = "attribute" + RETURN = "return_statement" + EXPRESSION = "expression_statement" + ASSIGNMENT = "assignment" + + +def traverse_tree(node: Node) -> Generator[Node, None, None]: + """ + Traverse the tree structure starting from the given node. + + :param node: The root node to start the traversal from. + :return: A generator object that yields nodes in the tree. 
+ """ + cursor = node.walk() + depth = 0 + + visited_children = False + while True: + if not visited_children: + yield cursor.node + if not cursor.goto_first_child(): + depth += 1 + visited_children = True + elif cursor.goto_next_sibling(): + visited_children = False + elif not cursor.goto_parent() or depth == 0: + break + else: + depth -= 1 + + +def syntax_check(code, verbose=False): + try: + ast.parse(code) + return True + except (SyntaxError, MemoryError): + if verbose: + traceback.print_exc() + return False + + +def code_extract(text: str) -> str: + lines = text.split("\n") + longest_line_pair = (0, 0) + longest_so_far = 0 + + for i in range(len(lines)): + for j in range(i + 1, len(lines)): + current_lines = "\n".join(lines[i : j + 1]) + if syntax_check(current_lines): + current_length = sum(1 for line in lines[i : j + 1] if line.strip()) + if current_length > longest_so_far: + longest_so_far = current_length + longest_line_pair = (i, j) + + return "\n".join(lines[longest_line_pair[0] : longest_line_pair[1] + 1]) + + +def get_definition_name(node: Node) -> str: + for child in node.children: + if child.type == NodeType.IDENTIFIER.value: + return child.text.decode("utf8") + + +def has_return_statement(node: Node) -> bool: + traverse_nodes = traverse_tree(node) + for node in traverse_nodes: + if node.type == NodeType.RETURN.value: + return True + return False + + +def get_deps(nodes: List[Tuple[str, Node]]) -> Dict[str, Set[str]]: + def dfs_get_deps(node: Node, deps: Set[str]) -> None: + for child in node.children: + if child.type == NodeType.IDENTIFIER.value: + deps.add(child.text.decode("utf8")) + else: + dfs_get_deps(child, deps) + + name2deps = {} + for name, node in nodes: + deps = set() + dfs_get_deps(node, deps) + name2deps[name] = deps + return name2deps + + +def get_function_dependency(entrypoint: str, call_graph: Dict[str, str]) -> Set[str]: + queue = [entrypoint] + visited = {entrypoint} + while queue: + current = queue.pop(0) + if current not in 
call_graph: + continue + for neighbour in call_graph[current]: + if neighbour not in visited: + visited.add(neighbour) + queue.append(neighbour) + return visited + + +def sanitize(code: str, entrypoint: Optional[str] = None) -> str: + """ + Sanitize and extract relevant parts of the given Python code. + This function parses the input code, extracts import statements, class and function definitions, + and variable assignments. If an entrypoint is provided, it only includes definitions that are + reachable from the entrypoint in the call graph. + + :param code: The input Python code as a string. + :param entrypoint: Optional name of a function to use as the entrypoint for dependency analysis. + :return: A sanitized version of the input code, containing only relevant parts. + """ + code = code_extract(code) + code_bytes = bytes(code, "utf8") + parser = Parser(Language(tree_sitter_python.language())) + tree = parser.parse(code_bytes) + class_names = set() + function_names = set() + variable_names = set() + + root_node = tree.root_node + import_nodes = [] + definition_nodes = [] + + for child in root_node.children: + if child.type in NodeType.IMPORT.value: + import_nodes.append(child) + elif child.type == NodeType.CLASS.value: + name = get_definition_name(child) + if not (name in class_names or name in variable_names or name in function_names): + definition_nodes.append((name, child)) + class_names.add(name) + elif child.type == NodeType.FUNCTION.value: + name = get_definition_name(child) + if not (name in function_names or name in variable_names or name in class_names) and has_return_statement( + child + ): + definition_nodes.append((name, child)) + function_names.add(get_definition_name(child)) + elif child.type == NodeType.EXPRESSION.value and child.children[0].type == NodeType.ASSIGNMENT.value: + subchild = child.children[0] + name = get_definition_name(subchild) + if not (name in variable_names or name in function_names or name in class_names): + 
definition_nodes.append((name, subchild)) + variable_names.add(name) + + if entrypoint: + name2deps = get_deps(definition_nodes) + reacheable = get_function_dependency(entrypoint, name2deps) + + sanitized_output = b"" + + for node in import_nodes: + sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + + for pair in definition_nodes: + name, node = pair + if entrypoint and name not in reacheable: + continue + sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + return sanitized_output[:-1].decode("utf8") diff --git a/methods/gptswarm/__init__.py b/methods/gptswarm/__init__.py new file mode 100644 index 0000000..9a5c2a5 --- /dev/null +++ b/methods/gptswarm/__init__.py @@ -0,0 +1 @@ +from .gptswarm_math import GPTswarm_MATH \ No newline at end of file diff --git a/methods/gptswarm/agents.py b/methods/gptswarm/agents.py new file mode 100644 index 0000000..da370c3 --- /dev/null +++ b/methods/gptswarm/agents.py @@ -0,0 +1,570 @@ +import asyncio +import os +import random +import re +import shortuuid +import warnings + +from copy import deepcopy +from collections import Counter +from pytube import YouTube +from termcolor import colored +from typing import Any,List,Optional,Dict + +from .prompt import * + +random.seed(0) +class BaseAgent(): + def __init__(self): + self.id = shortuuid.ShortUUID().random(length=4) + self.nodes = {} + self.input_nodes: List[Node] = [] + self.output_nodes: List[Node] = [] + + def add_node(self, node): + """ + Creates and adds a new node to the graph. + If id is not provided, generates a unique id for the node. 
+ """ + node_id = node.id if node.id is not None else shortuuid.ShortUUID().random(length=4) + while node_id in self.nodes: + node_id = shortuuid.ShortUUID().random(length=5) + node.id = node_id + self.nodes[node_id] = node + return node + +class IO(BaseAgent): + def __init__(self,env): + super().__init__() + io = DirectAnswer(env=env) + self.add_node(io) + self.input_nodes = [io] + self.output_nodes = [io] + +class IO_MATH(BaseAgent): + def __init__(self,env): + super().__init__() + io = DirectAnswer_MATH(env=env) + self.add_node(io) + self.input_nodes = [io] + self.output_nodes = [io] + +class CodeReact(BaseAgent): + def __init__(self,env,num_reacts: int = 1): + self.num_reacts = num_reacts + super().__init__() + code_writing = CodeWriting(env=env) + self.add_node(code_writing) + last_node = code_writing + for _ in range(self.num_reacts): + code_rewrite = CodeWriting(env=env) + last_node.add_successor(code_rewrite) + last_node = code_rewrite + self.add_node(code_rewrite) + + self.input_nodes = [code_writing] + self.output_nodes = [code_rewrite] + + def run(self, inputs: Dict[str, Any], max_tries: int = 3, max_time: int = 600) -> List[Any]: + def is_node_useful(node): + if node in self.output_nodes: + return True + for successor in node.successors: + if is_node_useful(successor): + return True + return False + + useful_node_ids = [node_id for node_id, node in self.nodes.items() if is_node_useful(node)] + in_degree = {node_id: len(self.nodes[node_id].predecessors) for node_id in useful_node_ids} + zero_in_degree_queue = [node_id for node_id, deg in in_degree.items() if deg == 0 and node_id in useful_node_ids] + + for _ , input_node in enumerate(self.input_nodes): + node_input = deepcopy(inputs) + input_node.inputs = [node_input] + + while zero_in_degree_queue: + current_node_id = zero_in_degree_queue.pop(0) + current_node = self.nodes[current_node_id] + tries = 0 + while tries < max_tries: + try: + asyncio.run(self.nodes[current_node_id].execute()) + break + except 
asyncio.TimeoutError: + print(f"Node {current_node_id} execution timed out, retrying {tries + 1} out of {max_tries}...") + except Exception as e: + print(f"Error during execution of node {current_node_id}: {e}") + break + tries += 1 + + for successor in current_node.successors: + if successor.id in useful_node_ids: + in_degree[successor.id] -= 1 + if in_degree[successor.id] == 0: + zero_in_degree_queue.append(successor.id) + + final_answers = [] + + for output_node in self.output_nodes: + output_messages = output_node.outputs + if len(output_messages) > 0: + final_answer = output_messages[-1].get("output", output_messages[-1]) + final_answers.append(final_answer) + else: + for output_message in output_messages: + final_answer = output_message.get("output", output_message) + final_answers.append(final_answer) + + if len(final_answers) == 0: + final_answers.append("No answer since there are no inputs provided") + return final_answers +''' +class ToolTOT(BaseAgent): + def __init__(self,env): + super().__init__() + query = GenerateQuery(self.domain, self.model_name) + + file_analysis = FileAnalyse(self.domain, self.model_name) + web_search = WebSearch(self.domain, self.model_name) + + query.add_successor(file_analysis) + query.add_successor(web_search) + + combine = CombineAnswer(self.domain, self.model_name) + file_analysis.add_successor(combine) + web_search.add_successor(combine) + + self.input_nodes = [query] + self.output_nodes = [combine] + + self.add_node(query) + self.add_node(file_analysis) + self.add_node(web_search) + self.add_node(combine) +''' + +class Node(): + def __init__(self, operation_description: str, id: Optional[str], combine_inputs_as_one: bool,env,domenstrations=None): + self.id = id if id is not None else shortuuid.ShortUUID().random(length=4) + self.operation_description = operation_description + self.predecessors: List[Node] = [] + self.successors: List[Node] = [] + self.inputs: List[Any] = [] + self.outputs: List[Any] = [] + 
self.domenstrations = domenstrations if domenstrations else [] + self.combine_inputs_as_one = combine_inputs_as_one + self.env = env + + def add_predecessor(self, operation: 'Node'): + if operation not in self.predecessors: + self.predecessors.append(operation) + operation.successors.append(self) + + def add_successor(self, operation: 'Node'): + if operation not in self.successors: + self.successors.append(operation) + operation.predecessors.append(self) + + def process_input(self, inputs): + all_inputs = [] + if inputs is None: + if self.predecessors: + + for predecessor in self.predecessors: + predecessor_input = self.env.memory.get(predecessor.id, []) + if isinstance(predecessor_input, list) and predecessor_input: + predecessor_input = predecessor_input[-1] + all_inputs.append(predecessor_input) + inputs = all_inputs + else: + raise ValueError("Input must be provided either directly or from predecessors.") + + elif not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + async def execute(self, **kwargs): + self.outputs = [] + tasks = [] + # 1.Create tasks + if not self.inputs and self.predecessors: + if self.combine_inputs_as_one: + combined_inputs = [] + for predecessor in self.predecessors: + predecessor_outputs = predecessor.outputs + if predecessor_outputs is not None and isinstance(predecessor_outputs, list): + combined_inputs.extend(predecessor_outputs) + tasks.append(asyncio.create_task(self._execute(combined_inputs, **kwargs))) + else: + for predecessor in self.predecessors: + predecessor_outputs = predecessor.outputs + if isinstance(predecessor_outputs, list) and predecessor_outputs: + for predecessor_output in predecessor_outputs: + tasks.append(asyncio.create_task(self._execute(predecessor_output, **kwargs))) + # There is direct input + elif self.inputs: + tasks = [asyncio.create_task(self._execute(input, **kwargs)) for input in self.inputs] + else: + warnings.warn("No input received.") + return + + # 2.Perform tasks + if tasks: + 
results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if not isinstance(result, Exception): + if not isinstance(result, list): + result = [result] + self.outputs.extend(result) + else: + print(colored(f"Node {type(self).__name__} failed to execute due to: {result.__class__.__name__}: {result}","light_red")) + +class FinalDecision(Node): + def __init__(self, operation_description: str = "Refer to all answers and give a final answer.",id=None,env=None): + super().__init__(operation_description, id, True,env) + + async def _execute(self, inputs: List[Any] = [], **kwargs) -> None: + prompt = None + response = None + if len(inputs) == 0: + raise Exception("No inputs is not supported for MajorityVote") + answers = [input.get("output") for input in inputs] + counter = Counter(answers) + sorted_counter = counter.most_common() + max_freq = sorted_counter[0][1] + equally_frequent_answers = [ans for ans, freq in sorted_counter if freq == max_freq] + response = random.choice(equally_frequent_answers) + # print(colored(f"{answers=} {response=}","blue")) + + executions = { + "task": inputs[0]["task"], + "files": inputs[0]["files"], + "input": inputs, + "subtask": prompt, + "output": response, + "format": "natural language"} + + if self.id not in self.env.memory: + self.env.memory[self.id] = [] + self.env.memory[self.id].append(executions) + + return executions + +class DirectAnswer(Node): + def __init__(self, operation_description: str = "Directly output an answer.",id=None,env=None): + super().__init__(operation_description, id, True,env) + + async def _execute(self, inputs: List[Any] = [], **kwargs): + + node_inputs = self.process_input(inputs) + outputs = [] + + for input in node_inputs: + task = input["task"] + messages = [{"role":"system", "content": MMLU_SYSTEM_PROMPT}, + {"role":"user","content":task}] + if self.env.inference_flag: + response = self.env.call_llm(messages=messages) + else: + response = 
self.env.call_llm(messages=messages,model_name=self.env.model_name_execute) + + execution = { + "task": task, + "files": input.get("files", []), + "input": task, + "output": response, + "ground_truth": input.get("GT", []), + "format": "natural language" + } + outputs.append(execution) + if self.id not in self.env.memory: + self.env.memory[self.id] = [] + self.env.memory[self.id].append(execution) + + return outputs + +class DirectAnswer_MATH(Node): + def __init__(self, operation_description: str = "Directly output an answer.",id=None,env=None): + super().__init__(operation_description, id, True,env) + + async def _execute(self, inputs: List[Any] = [], **kwargs): + + node_inputs = self.process_input(inputs) + outputs = [] + + for input in node_inputs: + task = input["task"] + messages = [{"role":"system", "content": MATH_SYSTEM_PROMPT}, + {"role":"user","content":task}] + if self.env.inference_flag: + response = self.env.call_llm(messages=messages) + else: + response = self.env.call_llm(messages=messages,model_name=self.env.model_name_execute) + + execution = { + "task": task, + "files": input.get("files", []), + "input": task, + "output": response, + "ground_truth": input.get("GT", []), + "format": "natural language" + } + outputs.append(execution) + if self.id not in self.env.memory: + self.env.memory[self.id] = [] + self.env.memory[self.id].append(execution) + + return outputs + +class CodeWriting(Node): + def __init__( + self, + env, + operation_description: str = "a Python code generator", + id=None, + prompt=None, + domenstrations=None + ): + super().__init__(operation_description, id, False,env,domenstrations) + self.prompt = prompt if prompt else CODE_PROMPT + self.max_domenstrations = 4 + + def extract_example(self, prompt: str) -> list: + lines = (line.strip() for line in prompt.split('\n') if line.strip()) + + results = [] + lines_iter = iter(lines) + for line in lines_iter: + if line.startswith('>>>'): + function_call = line[4:] + expected_output = 
next(lines_iter, None) + if expected_output: + results.append(f"assert {function_call} == {expected_output}") + + return results + + async def _execute(self, inputs: List[Any] = [], max_tries: int = 1, **kwargs): + """ + Execute the node with the given inputs. + """ + node_inputs = self.process_input(inputs) + node_outputs = [] + + for input in node_inputs: + if input.get('is_solved', False): + execution = deepcopy(input) + else: + task = input["task"] + if 'feedback' in input.keys(): + input = CODE_REACT_PROMPT.format(question=task, solution=input["output"], feedback=input["feedback"]) + else: + input = input["task"] + self.internal_tests = self.extract_example(task) + message = [] + message.append({"role":"system","content":self.prompt}) + for domenstration in self.domenstrations: + message.append({"role":"user","content":self.domenstration['input']}) + message.append({"role":"assistant","content":self.domenstration['output']}) + message.append({"role":"user","content":input}) + if self.env.inference_flag: + response = self.env.call_llm(messages=message) + else: + response = self.env.call_llm(messages=message,model_name=self.env.model_name_execute) + response = response.strip("```python\n").strip("```") + is_solved, feedback, _ = self.env.execute(response, self.internal_tests, timeout=10) + execution = { + "task": task, + "input": input, + "feedback": feedback, + "output": response, + "format": "python code", + "is_solved": is_solved, + } + if self.id not in self.env.memory: + self.env.memory[self.id] = [] + self.env.memory[self.id].append(execution) + node_outputs.append(execution) + + return node_outputs + + async def evaluate(self, candidate): + prompt, domenstrations = candidate + inputs = self.env.memory.get(self.id, []) + inputs = [record for record in self.env.memory.get(self.id,[])[-10:]] + score = 0 + for input in inputs: + messages = [] + messages.append({"role":"system","content":prompt}) + for domenstration in domenstrations: + 
messages.append({"role":"user","content":domenstration['input']}) + messages.append({"role":"assistant","content":domenstration['output']}) + messages.append({"role":"user","content":input['input']}) + response = self.env.call_llm(messages=messages) + response = response.strip("```python\n").strip("```") + tests = self.extract_example(input['task']) + is_solved, _, _ = self.env.execute(response, tests, timeout=10) + score += is_solved + + return score / len(inputs) +''' +class GenerateQuery(Node): + def __init__(self, + env, + operation_description: str = "Given a question, return what infomation is needed to answer the question.", + id=None): + super().__init__(operation_description, id, env,True) + + async def _execute(self, inputs: List[Any] = [], **kwargs): + youtube_regex = ( + r'(https?://)?(www\.)?' + '(youtube|youtu|youtube-nocookie)\.(com|be)/' + '(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})' + ) + node_inputs = self.process_input(inputs) + outputs = [] + + for input in node_inputs: + # Regular expression for matching URLs + url_pattern = r'https?://[^\s]+' + urls = re.findall(url_pattern, input["task"]) + download_paths = [] + + # Process each URL + for url in urls: + if bool(re.match(youtube_regex, url)): + download_path = self._youtube_download(url) + if download_path: + download_paths.append(download_path) + + files = input.get("files", []) + if not isinstance(files, list): + files = [] + files.extend(download_paths) + + + prompt = GAIA_PROMPT.format(question=input["task"]) + message = [{"role":"system", "content": GAIA_SYSTEM_PROMPT}, + {"role":"user","content": prompt}] + response = self.env.call_llm(messages=message) + + executions = { + "task": input["task"], + "files": files, + "input": input.get("task", None), + "subtask": prompt, + "output": response, + "format": "natural language"} + outputs.append(executions) + if self.id not in self.env.memory: + self.env.memory[self.id] = [] + self.env.memory[self.id].append(executions) + + return outputs + + 
def _youtube_download(self, url: str) -> str: + try: + video_id = url.split('v=')[-1].split('&')[0] + video_id = video_id.strip() + youtube = YouTube(url) + video_stream = youtube.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first() + if not video_stream: + raise ValueError("No suitable video stream found.") + + output_dir = "workspace_nopush/tmp" + os.makedirs(output_dir, exist_ok=True) + output_path = f"{output_dir}/{video_id}.mp4" + video_stream.download(output_path=output_dir, filename=f"{video_id}.mp4") + return output_path + + except Exception as e: + print(colored(f"Error downloading video from {url}: {e}","red")) + return "" + +class FileAnalyse(Node): + def __init__(self, + env, + operation_description: str = "Given a question, extract infomation from a file.", + id=None): + super().__init__(operation_description, id, env,True) + self.reader = GeneralReader() + + async def _execute(self, inputs: List[Any] = [], **kwargs): + node_inputs = self.process_input(inputs) + outputs = [] + for input in node_inputs: + query = input.get("output", "Please organize the information of this file.") + files = input["files"] + answer = '' + for file in files: + response = self.reader.read(query, file) + if not (isinstance(self.reader.file_reader, IMGReader) or isinstance(self.reader.file_reader, VideoReader)): + prompt = self.prompt_set.get_file_analysis_prompt(query=query, file=response) + response = self.env.call_llm(prompt=prompt) + answer += response + '\n' + + executions = { + "operation": self.node_name, + "task": input["task"], + "files": file, + "input": query, + "subtask": f"Read the content of ###{file}, use query ###{query}", + "output": response, + "format": "natural language" + } + + outputs.append(executions) + self.memory.add(self.id, executions) + return outputs + + def read(self, task, file): + files_content = "" + file_content = self.file_reader.read_file(file, task) + suffix = file.split(".")[-1] + + if suffix in 
['py', 'java', 'cpp', 'c', 'js', 'css', 'html', 'htm', 'xml']: + files_content += f'\nThe {suffix} file contains:\n---\n{file_content[0]}' + if file_content[1] != '': + files_content += f'\nExecution result:\n{file_content[1]}' + if file_content[2] != '': + files_content += f'\nExecution error message:\n{file_content[2]}' + files_content += '\n---' + + elif suffix in ['txt', 'jsonl', 'csv', 'json', 'jsonld', 'jsonl', 'yaml', 'yml', + 'xlsx', 'xls', 'jpg', 'png', 'jpeg', 'gif', 'bmp', 'mp3', 'wav', + 'ogg', 'mp4', 'avi', 'mkv', 'mov', 'pdf', 'doc', 'docx', 'ppt', + 'pptx', 'md', 'markdown', 'tex', 'zip', 'tar', 'gz', '7z', 'rar']: + files_content += f'\nThe {suffix} file contains:\n---\n{file_content}\n---' + + return files_content + + +class WebSearch(Node): + def __init__(self, + domain: str, + model_name: Optional[str] = None, + operation_description: str = "Given a question, search the web for infomation.", + id=None): + super().__init__(operation_description, id, True) + self.domain = domain + self.llm = LLMRegistry.get(model_name) + self.prompt_set = PromptSetRegistry.get(domain) + self.role = self.prompt_set.get_role() + self.constraint = self.prompt_set.get_constraint() + self.searcher =self._get_searcher() + +class CombineAnswer(Node): + def __init__(self, + domain: str, + model_name: Optional[str] = None, + operation_description: str = "Combine multiple inputs into one.", + max_token: int = 500, + id=None): + super().__init__(operation_description, id, True) + self.domain = domain + self.llm = LLMRegistry.get(model_name) + self.max_token = max_token + self.prompt_set = PromptSetRegistry.get(self.domain) + self.role = self.prompt_set.get_role() + self.constraint = self.prompt_set.get_constraint() +''' \ No newline at end of file diff --git a/methods/gptswarm/configs/config.yaml b/methods/gptswarm/configs/config.yaml new file mode 100644 index 0000000..299e8d9 --- /dev/null +++ b/methods/gptswarm/configs/config.yaml @@ -0,0 +1,4 @@ +num-truthful-agent: 6 
+num_iterations: 100 +#200 +lr: 0.1 \ No newline at end of file diff --git a/methods/gptswarm/evaluate_math_aflow.py b/methods/gptswarm/evaluate_math_aflow.py new file mode 100644 index 0000000..85de810 --- /dev/null +++ b/methods/gptswarm/evaluate_math_aflow.py @@ -0,0 +1,97 @@ +import regex,re +from termcolor import colored +from typing import Any +from math import isclose +from sympy import N, simplify +from sympy.parsing.latex import parse_latex +from sympy.parsing.sympy_parser import parse_expr + +def extract_model_answer(text: str) -> str: + pattern = r"\\boxed{((?:[^{}]|{[^{}]*})*)}" + boxed_matches = re.findall(pattern, text, re.DOTALL) + if boxed_matches: + return boxed_matches[-1].strip() + + sentence_end_pattern = r"(? bool: + if str(prediction) == str(reference): + return True + try: + if is_digit(prediction) and is_digit(reference): + prediction = parse_digits(prediction) + reference = parse_digits(reference) + return isclose(prediction, reference, abs_tol=1e-3) + except: + pass + + try: + return symbolic_equal(prediction, reference) + except: + pass + return False + +def is_digit(num): + return parse_digits(num) is not None + +def parse_digits(num): + num = regex.sub(",", "", str(num)) + try: + return float(num) + except: +# When the original input is a percentage in LaTeX format (e.g., 50\%), +# a backslash remains after processing, causing the float conversion to +# fail returning None, and subsequent math operations may produce type errors. 
+# num = num.replace("\\%", "").replace("%", "") + if num.endswith("%"): + num = num[:-1] + if num.endswith("\\"): + num = num[:-1] + try: + return float(num) / 100 + except: + pass + return None + +def symbolic_equal(a, b): + def _parse(s): + for f in [parse_latex, parse_expr]: + try: + return f(s) + except: + pass + return s + + a = _parse(a) + b = _parse(b) + + try: + if simplify(a - b) == 0: + return True + except: + pass + + try: + if isclose(N(a), N(b), abs_tol=1e-3): + return True + except: + pass + return False + diff --git a/methods/gptswarm/gptswarm_math.py b/methods/gptswarm/gptswarm_math.py new file mode 100644 index 0000000..1acafd9 --- /dev/null +++ b/methods/gptswarm/gptswarm_math.py @@ -0,0 +1,337 @@ +# The original repo also has “full” and “random” modes, but here we just use “optimized”. +import asyncio +import json +import os +import time +import numpy as np +import pandas as pd +import shortuuid +import torch + +from copy import deepcopy +from pathlib import Path +from termcolor import colored +from typing import Any,Iterator,Dict,List,Tuple + +from .agents import IO_MATH,FinalDecision +from ..mas_base import MAS +from ..utils import load_config +from .evaluate_math_aflow import grade_answer + +inference_execute_token_stats = {} + +class GPTswarm_MATH(MAS): + def __init__(self, general_config, method_config_name="config"): + super().__init__(general_config) + method_config_name = "config" if method_config_name is None else method_config_name + self.method_config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml")) + self.num_truthful_agents = self.method_config["num-truthful-agent"] + self.num_iterations = self.method_config["num_iterations"] + self.dataset_name = general_config['test_dataset_name'] + self.model_name_execute = self.method_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18') + self.results_path = 
f"results/{self.dataset_name}/gptswarm/{self.model_name_execute}" + self.lr = self.method_config["lr"] + self.used_agents = [] + self.agents = {"IO": IO_MATH} + + self.init_connection_probability = 0.5 + self.potential_connections = [] + + self.decision_method = FinalDecision(env=self) + + self.memory: Dict[str, List[Dict[str, Any]]] = {} + self.nodes = {} + self.graphs = [] + self.input_nodes = [] + self.output_nodes = [self.decision_method] + + self.inference_flag = True + + self.add_node(self.decision_method) + self.organize() + inference_execute_token_stats[self.model_name_execute] = {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} + + def organize(self): + agent_name_list = self.num_truthful_agents * ["IO"] + for agent_name in agent_name_list: + agent_instance = self.agents.get(agent_name)(env=self) + self.add_graph(agent_instance) + self.used_agents.append(agent_instance) + + # Add bi-directional connections between all nodes of all agents (except for the decision nodes). 
+ for agent1 in self.used_agents: + for agent2 in self.used_agents: + if agent1 != agent2: + for node1 in agent1.nodes: + for node2 in agent2.nodes: + self.potential_connections.append((node1, node2)) # (from, to) + + for agent in self.used_agents: + for node in agent.nodes: + self.potential_connections.append((node, self.decision_method.id)) # (from, to) + + # Single scalar + init_logit = torch.log(torch.tensor(self.init_connection_probability / (1 - self.init_connection_probability))) + # The shape is one-dimensional and the length is len(self.potential_connections) + init_tensor = torch.ones(len(self.potential_connections),requires_grad=True) * init_logit + self.edge_logits = torch.nn.Parameter(init_tensor) + + # A collection of node IDs for each agent + node_ids = set([x for pair in self.potential_connections for x in pair]) + self.node_idx2id = {i: node_id for i, node_id in enumerate(node_ids)} + self.node_id2idx = {node_id: i for i, node_id in enumerate(node_ids)} + order_tensor = torch.randn(len(node_ids)) + self.order_params = torch.nn.Parameter(order_tensor) + + def add_graph(self, graph): + for node in graph.nodes.values(): + self.add_node(node) + self.graphs.append(graph) + self.input_nodes.extend(graph.input_nodes) + + def add_node(self, node): + """ + Creates and adds a new node to the graph. + If id is not provided, generates a unique id for the node. + """ + node_id = node.id if node.id is not None else shortuuid.ShortUUID().random(length=4) + while node_id in self.nodes: + node_id = shortuuid.ShortUUID().random(length=5) + node.id = node_id + self.nodes[node_id] = node + return node + + def check_cycle(self, new_node, target_nodes): + # Once a loop is detected, True is returned. 
+ if new_node in target_nodes: + return True + for successor in new_node.successors: + if self.check_cycle(successor, target_nodes): + return True + return False + + def generate_graph(self,temperature: float = 1.0) -> Tuple[torch.Tensor]: + # randomly generate graph + log_probs = [torch.tensor(0.0, requires_grad=True)] + _graph = deepcopy(self) + for potential_connection, edge_logit in zip(self.potential_connections, self.edge_logits): + out_node = _graph.nodes.get(potential_connection[0]) + in_node = _graph.nodes.get(potential_connection[1]) + + if not out_node or not in_node: + continue + + if not _graph.check_cycle(in_node, {out_node}): + edge_prob = torch.sigmoid(edge_logit / temperature) + if torch.rand(1) < edge_prob: + out_node.add_successor(in_node) + log_probs.append(torch.log(edge_prob)) + else: + log_probs.append(torch.log(1 - edge_prob)) + + log_prob = torch.sum(torch.stack(log_probs)) + return _graph, log_prob + + def _generate_graph(self,edge_mask: torch.Tensor) -> Tuple[torch.Tensor]: + _graph = deepcopy(self) + for _, (potential_connection, is_edge) in enumerate(zip(self.potential_connections, edge_mask)): + out_node = _graph.nodes.get(potential_connection[0]) + in_node = _graph.nodes.get(potential_connection[1]) + + if not out_node or not in_node: + continue + + if not _graph.check_cycle(in_node, {out_node}): + if is_edge: + out_node.add_successor(in_node) + in_node.add_predecessor(out_node) + return _graph + + def inference(self, sample): + query = sample.get("query") + if not query: + raise ValueError("Sample must contain a 'query' key.") + self.inference_flag = True + optimized_path = Path(self.results_path) / "best_workflow.npy" + if optimized_path.exists(): + loaded_probs_npy = np.load(optimized_path) + self.edge_logits = torch.from_numpy(loaded_probs_npy) + edge_mask = self.edge_logits > 0.5 + graph = self._generate_graph(edge_mask) + input_dict = {"task": query} + response = graph._inference(input_dict) + 
    def optimizing(self,val_dataset,batch_size: int = 4) -> None:
        """Train the edge logits on ``val_dataset`` and save them to disk.

        Does nothing (besides printing a notice) when
        ``<results_path>/best_workflow.npy`` already exists. Otherwise runs
        ``self.num_iterations`` Adam steps. In each step ``batch_size`` graphs
        are sampled with ``generate_graph``, each is run on one record, the
        first answer is graded with ``grade_answer``, and the loss is the mean
        of ``-log_prob * utility``.

        Args:
            val_dataset: Sized, integer-indexable collection of records. Each
                record must support ``record['query']`` and ``record.get`` and
                carry the reference answer under ``'solution'`` or ``'gt'``.
            batch_size: Number of sampled graphs / records per optimizer step.

        Returns:
            None. Results are written to ``api_token.json`` and
            ``best_workflow.npy`` under ``self.results_path``.

        Raises:
            KeyError: If a record has neither ``'solution'`` nor ``'gt'``.
            Exception: If the selected answer is not a string.
        """
        # Optimizes on whatever records the caller passes as val_dataset.
        self.inference_flag = False
        optimized_path = Path(self.results_path) / "best_workflow.npy"
        if optimized_path.exists():
            print(colored("The optimal graph already exists!\n","red"))
            return
        print(colored("Optimizing swarm on MATHDataset split dev...","light_yellow"))
        optimizer = torch.optim.Adam([self.edge_logits, self.order_params], lr=self.lr)

        def infinite_data_loader() -> Iterator[pd.DataFrame]:
            # One permutation is drawn up front and then cycled forever, so
            # every pass over the data uses the same order.
            perm = np.random.permutation(len(val_dataset))
            while True:
                for idx in perm:
                    record = val_dataset[idx.item()]
                    yield record

        loader = infinite_data_loader()

        edge_probs = None
        for i_iter in range(self.num_iterations):
            print(f"Iter {i_iter}", 80*'-')
            start_ts = time.time()
            raw_answers = []
            log_probs = []
            correct_answers = []

            # zip with range(batch_size) takes exactly batch_size records from
            # the endless loader.
            for _, record in zip(range(batch_size), loader):

                # A fresh graph is sampled for every record; log_prob is the
                # log-likelihood of that sample under the current edge logits.
                graph, log_prob = self.generate_graph()

                demo_question = (f"{record['query']}\n")
                input_dict = {"task": demo_question}
                answer = graph._inference(input_dict)
                # NOTE(review): inference_execute_token_stats is a module-level
                # name defined outside this method; this assumes it already has
                # an entry for self.model_name_execute. Confirm.
                inference_execute_token_stats[self.model_name_execute]["num_llm_calls"] += graph.token_stats[self.model_name_execute]["num_llm_calls"]
                inference_execute_token_stats[self.model_name_execute]["prompt_tokens"] += graph.token_stats[self.model_name_execute]["prompt_tokens"]
                inference_execute_token_stats[self.model_name_execute]["completion_tokens"] += graph.token_stats[self.model_name_execute]["completion_tokens"]
                # print(colored(answer,"light_cyan"))
                raw_answers.append(answer)
                log_probs.append(log_prob)

                # Prefer 'solution', fall back to 'gt'.
                correct_answer = record.get('solution', record.get('gt'))
                if correct_answer is None:
                    raise KeyError("Expected 'solution' or 'gt' in record for grading.")

                assert isinstance(correct_answer, str), (
                    f"String expected but got {correct_answer} "
                    f"of type {type(correct_answer)} (2)" \
                    f" record={record}")
                correct_answers.append(correct_answer)



            print(f"Batch time {time.time() - start_ts:.3f}")

            loss_list: List[torch.Tensor] = []
            utilities: List[float] = []
            _num_correct = 0
            _num_total = 0

            for raw_answer, log_prob, correct_answer in zip(raw_answers, log_probs, correct_answers):
                # Only the first element of a list answer is graded; an empty
                # list is graded as "".
                # NOTE(review): when raw_answer is not a list, `answer` keeps
                # whatever value it had from an earlier statement.
                if isinstance(raw_answer, list):
                    if len(raw_answer) > 0:
                        answer = raw_answer[0]
                    else:
                        answer = ""
                if not isinstance(answer, str):
                    raise Exception("Expected string")

                assert isinstance(correct_answer, str), \
                    f"String expected but got {correct_answer} of type {type(correct_answer)} (1)"

                is_correct = grade_answer(answer,correct_answer)
                _num_correct += int(is_correct)
                _num_total += 1
                # NOTE(review): utility is the running accuracy over the batch
                # so far, not this sample's own 0/1 score, so each sample's
                # weight depends on its position in the batch. Confirm this is
                # intended.
                utility = _num_correct / _num_total
                utilities.append(utility)
                # Looks like a REINFORCE-style term: minimising -log_prob * utility
                # raises the probability of graphs with higher utility.
                single_loss = - log_prob * utility
                loss_list.append(single_loss)

            print("utilities:", utilities)
            total_loss = torch.mean(torch.stack(loss_list))
            print("loss:", total_loss.item())
            optimizer.zero_grad()
            total_loss.backward()
            print("Grad:", self.edge_logits.grad)
            optimizer.step()
            print("edge_logits:", self.edge_logits)
            # edge_probs is only printed; it is not what gets saved below.
            edge_probs = torch.sigmoid(self.edge_logits)
            print("edge_probs:", edge_probs)
            print("end of iteration")

        print(colored("Done!","green"))
        token_path = os.path.join(self.results_path,"api_token.json")
        os.makedirs(os.path.dirname(token_path), exist_ok=True)
        with open(token_path,"w") as f:
            json.dump(inference_execute_token_stats, f, indent=4)
        # NOTE(review): despite the name, this array holds the raw logits
        # (no sigmoid applied), so best_workflow.npy stores logits.
        edge_probs_np = self.edge_logits.detach().numpy()
        graph_path = self.results_path
        if not os.path.exists(graph_path):
            os.makedirs(graph_path)
        dest = os.path.join(graph_path, "best_workflow.npy")
        np.save(dest, edge_probs_np)
        print(colored("Best graph saved!","light_yellow"))
    def __init__(self, general_config, method_config_name="config"):
        """Build the swarm: load the method config, create the decision node and wire up agents.

        Args:
            general_config: Mapping passed on to the ``MAS`` base class. Must
                contain ``'test_dataset_name'``; ``'optimize_execute_model_name'``
                is optional.
            method_config_name: Base name (without ``.yaml``) of the file in the
                ``configs`` directory next to this module. ``None`` is treated
                as ``"config"``.
        """
        super().__init__(general_config)
        method_config_name = "config" if method_config_name is None else method_config_name
        self.method_config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml"))
        # Note the key spelling: hyphens here, underscores for "num_iterations".
        self.num_truthful_agents = self.method_config["num-truthful-agent"]
        self.num_iterations = self.method_config["num_iterations"]
        self.dataset_name = general_config['test_dataset_name']
        self.model_name_execute = general_config.get('optimize_execute_model_name','gpt-4o-mini-2024-07-18')
        # Directory where optimizing() writes, and inference() reads, best_workflow.npy.
        self.results_path = f"results/{self.dataset_name}/gptswarm/{self.model_name_execute}"
        self.lr = self.method_config["lr"]
        self.used_agents = []
        # Registry of agent classes by name; organize() instantiates from it.
        self.agents = {"IO": IO}

        # Starting probability for every candidate edge (converted to a logit in organize()).
        self.init_connection_probability = 0.5
        self.potential_connections = []

        # The decision node receives this object as its environment.
        # NOTE(review): FinalDecision may read attributes of `env` during
        # construction, so keep the statement order here unless that is checked.
        self.decision_method = FinalDecision(env=self)

        self.memory: Dict[str, List[Dict[str, Any]]] = {}
        self.nodes = {}
        self.graphs = []
        self.input_nodes = []
        # The decision node is the only output node of the composed graph.
        self.output_nodes = [self.decision_method]

        self.inference_flag = True

        # Register the decision node first, then add the agents and build the
        # candidate edge list and trainable parameters.
        self.add_node(self.decision_method)
        self.organize()
+ """ + node_id = node.id if node.id is not None else shortuuid.ShortUUID().random(length=4) + while node_id in self.nodes: + node_id = shortuuid.ShortUUID().random(length=5) + node.id = node_id + self.nodes[node_id] = node + return node + + def check_cycle(self, new_node, target_nodes): + # Once a loop is detected, True is returned. + if new_node in target_nodes: + return True + for successor in new_node.successors: + if self.check_cycle(successor, target_nodes): + return True + return False + + def generate_graph(self,temperature: float = 1.0) -> Tuple[torch.Tensor]: + # randomly generate graph + log_probs = [torch.tensor(0.0, requires_grad=True)] + _graph = deepcopy(self) + for potential_connection, edge_logit in zip(self.potential_connections, self.edge_logits): + out_node = _graph.nodes.get(potential_connection[0]) + in_node = _graph.nodes.get(potential_connection[1]) + + if not out_node or not in_node: + continue + + if not _graph.check_cycle(in_node, {out_node}): + edge_prob = torch.sigmoid(edge_logit / temperature) + if torch.rand(1) < edge_prob: + out_node.add_successor(in_node) + log_probs.append(torch.log(edge_prob)) + else: + log_probs.append(torch.log(1 - edge_prob)) + + log_prob = torch.sum(torch.stack(log_probs)) + return _graph, log_prob + + def _generate_graph(self,edge_mask: torch.Tensor) -> Tuple[torch.Tensor]: + _graph = deepcopy(self) + for i, (potential_connection, is_edge) in enumerate(zip(self.potential_connections, edge_mask)): + out_node = _graph.nodes.get(potential_connection[0]) + in_node = _graph.nodes.get(potential_connection[1]) + + if not out_node or not in_node: + continue + + if not _graph.check_cycle(in_node, {out_node}): + if is_edge: + out_node.add_successor(in_node) + in_node.add_predecessor(out_node) + return _graph + + def inference(self, sample): + query = sample.get("query") + if not query: + raise ValueError("Sample must contain a 'query' key.") + self.inference_flag = True + optimized_path = Path(self.results_path) / 
"best_workflow.npy" + if optimized_path.exists(): + loaded_probs_npy = np.load(optimized_path) + self.edge_logits = torch.from_numpy(loaded_probs_npy) + edge_mask = self.edge_logits > 0.5 + graph = self._generate_graph(edge_mask) + input_dict = {"task": query} + response = graph._inference(input_dict) + response = "\n".join(response) + else: + raise NotImplementedError("Best_workflow path does not exist!") + return response + + + def optimizing(self,val_dataset,batch_size: int = 4) -> torch.Tensor: + # Here mmlu is optimized with dev. + self.inference_flag = False + optimized_path = Path(self.results_path) / "best_workflow.npy" + if optimized_path.exists(): + print(colored("The optimal graph already exists!\n","red")) + return + print(colored("Optimizing swarm on MMLUDataset split dev...","light_yellow")) + optimizer = torch.optim.Adam([self.edge_logits, self.order_params], lr=self.lr) + + def infinite_data_loader() -> Iterator[pd.DataFrame]: + perm = np.random.permutation(len(val_dataset)) + while True: + for idx in perm: + record = val_dataset[idx.item()] + yield record + + loader = infinite_data_loader() + + edge_probs = None + for i_iter in range(self.num_iterations): + print(f"Iter {i_iter}", 80*'-') + start_ts = time.time() + raw_answers = [] + log_probs = [] + correct_answers = [] + + for _, record in zip(range(batch_size), loader): + + graph, log_prob = self.generate_graph() + + demo_question = (f"{record['query']}\n") + input_dict = {"task": demo_question} + answer = graph._inference(input_dict) + print(colored(answer,"light_cyan")) + raw_answers.append(answer) + log_probs.append(log_prob) + + correct_answer = record['gt'][1] + + assert isinstance(correct_answer, str), ( + f"String expected but got {correct_answer} " + f"of type {type(correct_answer)} (2)" \ + f" record={record}") + correct_answers.append(correct_answer) + + + + print(f"Batch time {time.time() - start_ts:.3f}") + + loss_list: List[torch.Tensor] = [] + utilities: List[float] = [] + 
_num_correct = 0 + _num_total = 0 + + for raw_answer, log_prob, correct_answer in zip(raw_answers, log_probs, correct_answers): + if isinstance(raw_answer, list): + if len(raw_answer) > 0: + answer = raw_answer[0] + else: + answer = "" + if not isinstance(answer, str): + raise Exception("Expected string") + if len(answer) > 0: + answer = answer[0] # Try to format the answer by taking the first letter + assert isinstance(correct_answer, str), \ + f"String expected but got {correct_answer} of type {type(correct_answer)} (1)" + + is_correct = answer == correct_answer + _num_correct += int(is_correct) + _num_total += 1 + utility = _num_correct / _num_total + utilities.append(utility) + single_loss = - log_prob * utility + loss_list.append(single_loss) + + print("utilities:", utilities) + total_loss = torch.mean(torch.stack(loss_list)) + print("loss:", total_loss.item()) + optimizer.zero_grad() + total_loss.backward() + print("Grad:", self.edge_logits.grad) + optimizer.step() + print("edge_logits:", self.edge_logits) + edge_probs = torch.sigmoid(self.edge_logits) + print("edge_probs:", edge_probs) + print("end of iteration") + + print(colored("Done!","green")) + edge_probs_np = self.edge_logits.detach().numpy() + graph_path = self.results_path + if not os.path.exists(graph_path): + os.makedirs(graph_path) + dest = os.path.join(graph_path, "best_workflow.npy") + np.save(dest, edge_probs_np) + print(colored("Best graph saved!","light_yellow")) + + def _inference(self, inputs: Dict[str, Any], max_tries: int = 3, max_time: int = 600) -> List[Any]: + + def is_node_useful(node): + if node in self.output_nodes: + return True + for successor in node.successors: + if is_node_useful(successor): + return True + return False + + useful_node_ids = [node_id for node_id, node in self.nodes.items() if is_node_useful(node)] + in_degree = {node_id: len(self.nodes[node_id].predecessors) for node_id in useful_node_ids} + # Contains the IDs of all useful nodes with zero intake + 
zero_in_degree_queue = [node_id for node_id, deg in in_degree.items() if deg == 0 and node_id in useful_node_ids] + + for i, input_node in enumerate(self.input_nodes): + node_input = deepcopy(inputs) + input_node.inputs = [node_input] + + while zero_in_degree_queue: + current_node_id = zero_in_degree_queue.pop(0) + current_node = self.nodes[current_node_id] + tries = 0 + while tries < max_tries: + try: + asyncio.run(self.nodes[current_node_id].execute()) + break + except asyncio.TimeoutError: + print(f"Node {current_node_id} execution timed out, retrying {tries + 1} out of {max_tries}...") + except Exception as e: + print(f"Error during execution of node {current_node_id}: {e}") + break + tries += 1 + + for successor in current_node.successors: + if successor.id in useful_node_ids: + in_degree[successor.id] -= 1 + if in_degree[successor.id] == 0: + zero_in_degree_queue.append(successor.id) + + final_answers = [] + + for output_node in self.output_nodes: + output_messages = output_node.outputs + # return all outputs + if len(output_messages) > 0: + final_answer = output_messages[-1].get("output", output_messages[-1]) + final_answers.append(final_answer) + else: + for output_message in output_messages: + final_answer = output_message.get("output", output_message) + final_answers.append(final_answer) + + if len(final_answers) == 0: + final_answers.append("No answer since there are no inputs provided") + return final_answers + + + + + \ No newline at end of file diff --git a/methods/gptswarm/prompt.py b/methods/gptswarm/prompt.py new file mode 100644 index 0000000..8dc681f --- /dev/null +++ b/methods/gptswarm/prompt.py @@ -0,0 +1,79 @@ +MMLU_SYSTEM_PROMPT = """ +You are a knowlegable expert in question answering. +I will ask you a question. +I will also give you 4 answers enumerated as A, B, C and D. +Only one answer out of the offered 4 is correct. +You must choose the correct answer to the question. 
# Template asking the model to rewrite a code-generator prompt.
# Placeholders: {prompt} (the current prompt) and {advice} (improvement hints).
# The closing sentence previously read "wrapped with and ." — its two
# delimiter tokens were missing, so the instruction was meaningless.
# NOTE(review): delimiters restored as <START>/<END>; confirm they match
# whatever code extracts the improved prompt from the reply.
META_PROMPT2 = """
I'm trying to define a Python code generator by prompting.
My current prompt is:
"{prompt}"

To generate an improved prompt, consider the following:
{advice}
Generate an improved prompt within five sentences. Do not mention a specific task in the prompt!
The prompt should be wrapped with <START> and <END>.
"""
+ + +## ❓ Target Question: +{question} + + +## 🔍 Clues for Investigation: +Identify critical clues and concepts within the question that are essential for finding the answer. +""" \ No newline at end of file diff --git a/methods/gptswarm/utils.py b/methods/gptswarm/utils.py new file mode 100644 index 0000000..b83b35b --- /dev/null +++ b/methods/gptswarm/utils.py @@ -0,0 +1,34 @@ +import ast +import astunparse +import concurrent.futures + +def function_with_timeout(func, args, timeout): + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args) + try: + return future.result(timeout=timeout) + except concurrent.futures.TimeoutError: + future.cancel() + raise TimeoutError(f"Function execution exceeded the timeout limit of {timeout} seconds") + except Exception as exc: + raise exc + +def get_output(func: str, assert_statement: str, timeout: int = 5) -> str: + try: + exec(f"from typing import *\n{func}", globals()) + func_call = get_call_str(assert_statement) + output = function_with_timeout(eval, (func_call, globals()), timeout) + return output + except TimeoutError: + return "TIMEOUT" + except Exception as e: + return str(e) + +def get_call_str(assert_statement: str) -> str: + ast_parsed = ast.parse(assert_statement) + try: + call_str = ast_parsed.body[0].test.left # type: ignore + except: + call_str = ast_parsed.body[0].test # type: ignore + + return astunparse.unparse(call_str).strip() \ No newline at end of file