diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 904110dc..2ed9bae1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ minimum_pre_commit_version: "2.9.0" repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: check-yaml args: [--allow-multiple-documents] @@ -15,20 +15,20 @@ repos: hooks: - id: prettier - repo: https://github.com/asottile/reorder-python-imports - rev: v3.10.0 + rev: v3.16.0 hooks: - id: reorder-python-imports args: [--py39-plus] - - repo: https://github.com/psf/black - rev: 23.3.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 26.3.1 hooks: - id: black - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.3.0 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.1 + rev: v1.20.0 hooks: - id: mypy additional_dependencies: [numpy, httpx, pytest, structlog, types-PyYAML] diff --git a/ice/agent.py b/ice/agent.py index ae3c6b6e..7ee5a951 100644 --- a/ice/agent.py +++ b/ice/agent.py @@ -21,8 +21,7 @@ except ImportError: class Tfew(Agent): - def __init__(self, *args, **kwargs): - ... + def __init__(self, *args, **kwargs): ... MACHINE_AGENTS = { diff --git a/ice/agents/augmented.py b/ice/agents/augmented.py index 1cb81433..1f8db867 100644 --- a/ice/agents/augmented.py +++ b/ice/agents/augmented.py @@ -66,7 +66,7 @@ async def classify( default: Optional[str] = None, verbose: bool = False, ) -> tuple[dict[str, float], Optional[str]]: - (machine_probs, explanation) = await self.machine.classify( + machine_probs, explanation = await self.machine.classify( prompt=prompt, choices=choices, default=default, diff --git a/ice/cache.py b/ice/cache.py index 4a72ee6f..60936cc5 100644 --- a/ice/cache.py +++ b/ice/cache.py @@ -1,6 +1,7 @@ """ Decorator for caching function results to disk """ + import asyncio import functools import inspect diff --git a/ice/datasets/qasper.py b/ice/datasets/qasper.py index b0774a11..6b87f82e 100644 --- a/ice/datasets/qasper.py +++ b/ice/datasets/qasper.py @@ -13,7 +13,6 @@ from ice.paper import split_sentences from ice.recipes.meta.eval_paper_qa.types import PaperQaGoldStandard - TRAIN_PATH = "/code/datasets/qasper-train-v0.3.json" VAL_PATH = "/code/datasets/qasper-dev-v0.3.json" diff --git a/ice/evaluation/evaluate_recipe_result.py b/ice/evaluation/evaluate_recipe_result.py index 824ea578..6464dea5 100644 --- a/ice/evaluation/evaluate_recipe_result.py +++ b/ice/evaluation/evaluate_recipe_result.py @@ -138,9 +138,7 @@ def __str__(self) -> str: correctness = ( "Correct" if self.is_correct == True - else "Incorrect" - if self.is_correct == False - else "Not evaluated" + else "Incorrect" if self.is_correct == False else "Not evaluated" ) return f"""{correctness}. - Predicted: {self.predicted} @@ -196,13 +194,17 @@ def evaluated_classifications(self) -> list[EvaluatedClassification]: for i in range(0, max(len(recipe_classifications), len(gold_classifications))): evaluated_classification = EvaluatedClassification( - predicted=recipe_classifications[i] - if i < len(recipe_classifications) - else None, + predicted=( + recipe_classifications[i] + if i < len(recipe_classifications) + else None + ), gold=gold_classifications[i] if i < len(gold_classifications) else None, - classification_eq=self.classification_eq[i] - if i < len(self.classification_eq) - else None, + classification_eq=( + self.classification_eq[i] + if i < len(self.classification_eq) + else None + ), ) evaluated_classifications.append(evaluated_classification) diff --git a/ice/evaluation/evaluation_report.py b/ice/evaluation/evaluation_report.py index c41d1791..34727bae 100644 --- a/ice/evaluation/evaluation_report.py +++ b/ice/evaluation/evaluation_report.py @@ -382,9 +382,9 @@ def make_dashboard_row_df(self): classification_summary.proportion_correct ) - row[ - f"Classification {i+1} # evaluated" - ] = classification_summary.num_evaluated + row[f"Classification {i+1} # evaluated"] = ( + classification_summary.num_evaluated + ) df = pd.DataFrame([row]) df.to_csv( @@ -408,9 +408,9 @@ def make_experiments_evaluation_df(self): "ice_commit": latest_commit_hash(), "document_id": result.document_id, "split": result.gold_standard.split if result.gold_standard else None, - "experiment": result.gold_standard.experiment - if result.gold_standard - else None, + "experiment": ( + result.gold_standard.experiment if result.gold_standard else None + ), "total_gs_quotes": len( result.evaluated_excerpts.gold_standards_in_excerpts_results ), @@ -420,9 +420,9 @@ def make_experiments_evaluation_df(self): "excerpts": result.evaluated_excerpts.excerpts, "gs_quotes": result.evaluated_excerpts.gold_standards_str(), "answer": result.answer, - "gs_answer": result.gold_standard.answer - if result.gold_standard - else None, + "gs_answer": ( + result.gold_standard.answer if result.gold_standard else None + ), "answer_rating": result.answer_rating, "failure_modes": result.failure_modes, } diff --git a/ice/evaluation/summarize_experiment_evals.py b/ice/evaluation/summarize_experiment_evals.py index 7079baa1..11683d78 100644 --- a/ice/evaluation/summarize_experiment_evals.py +++ b/ice/evaluation/summarize_experiment_evals.py @@ -35,13 +35,17 @@ async def summarize_experiment_evals(results_file: str): row.get("classification_1"), row.get("classification_2"), ], - answer_rating=None - if pd.isna(row.get("answer_rating")) - else int(row.get("answer_rating")), + answer_rating=( + None + if pd.isna(row.get("answer_rating")) + else int(row.get("answer_rating")) + ), elicit_commit=row.get("elicit_commit"), - failure_modes=None - if pd.isna(row.get("failure_modes")) - else row.failure_modes.split(","), + failure_modes=( + None + if pd.isna(row.get("failure_modes")) + else row.failure_modes.split(",") + ), ) for _, row in recipe_df.iterrows() ] diff --git a/ice/formatter/multi.py b/ice/formatter/multi.py index e0d4ac99..668222e6 100644 --- a/ice/formatter/multi.py +++ b/ice/formatter/multi.py @@ -50,7 +50,7 @@ def _is_partial(**fields: Union[literal, _NotNeededSentinel]): def all_values_needed( - examples: Sequence[Mapping[str, Union[literal_or_transform, _NotNeededSentinel]]] + examples: Sequence[Mapping[str, Union[literal_or_transform, _NotNeededSentinel]]], ) -> TypeGuard[Sequence[Mapping[str, literal_or_transform]]]: return all( ( @@ -107,7 +107,7 @@ def _unparse(parses: _StdLibFormatStringParses) -> str: def _no_sentinels_remaining( - concrete_values: dict[str, Union[literal, _NotNeededSentinel]] + concrete_values: dict[str, Union[literal, _NotNeededSentinel]], ) -> TypeGuard[dict[str, literal]]: return all( (value is not _not_needed_sentinel for value in concrete_values.values()) @@ -146,7 +146,7 @@ def _format_truncate( def _has_stop( - concrete_values: Mapping[str, Union[literal, _NotNeededSentinel]] + concrete_values: Mapping[str, Union[literal, _NotNeededSentinel]], ) -> bool: return any(isinstance(value, StopSentinel) for value in concrete_values.values()) diff --git a/ice/formatter/transform/dependent.py b/ice/formatter/transform/dependent.py index 280bde7b..175b0f58 100644 --- a/ice/formatter/transform/dependent.py +++ b/ice/formatter/transform/dependent.py @@ -9,7 +9,6 @@ from ice.formatter.transform import _Transform - T_contra = TypeVar("T_contra", contravariant=True) diff --git a/ice/formatter/transform/value.py b/ice/formatter/transform/value.py index 2a28d4a8..fa29d2a4 100644 --- a/ice/formatter/transform/value.py +++ b/ice/formatter/transform/value.py @@ -6,7 +6,6 @@ from ice.formatter.transform import _Transform - T_contra = TypeVar("T_contra", contravariant=True) diff --git a/ice/json_value.py b/ice/json_value.py index 5a19d373..0151c37e 100644 --- a/ice/json_value.py +++ b/ice/json_value.py @@ -6,7 +6,6 @@ from fvalues import F - JSONValue = Union[ str, int, float, bool, None, list["JSONValue"], dict[str, "JSONValue"] ] diff --git a/ice/metrics/gold_paragraphs.py b/ice/metrics/gold_paragraphs.py index 9e5d85e6..2319b93b 100644 --- a/ice/metrics/gold_paragraphs.py +++ b/ice/metrics/gold_paragraphs.py @@ -1,6 +1,7 @@ """ Make a dataframe that contains the paragraphs that contain the gold standard quotes. """ + import asyncio from pathlib import Path from typing import Optional @@ -72,15 +73,13 @@ def get_containing_paragraph( # Explanations: # - Quote is split across two paragraphs # - Document paragraphs don't include quote - log.warning( - f"""Couldn't find gold standard paragraph for quote + log.warning(f"""Couldn't find gold standard paragraph for quote > {quote} in {document_id}. Best recall was {best_recall:.2f}. Best paragraph was: -> {best_recall_paragraph}""" - ) +> {best_recall_paragraph}""") return best_recall_paragraph diff --git a/ice/metrics/gold_standards.py b/ice/metrics/gold_standards.py index 7c0e43d6..96e8ee89 100644 --- a/ice/metrics/gold_standards.py +++ b/ice/metrics/gold_standards.py @@ -184,8 +184,7 @@ def get_gold_standards( question_short_name: Optional[str] = None, experiment: Optional[str] = None, model_type: None = None, -) -> list[GoldStandard[Any]]: - ... +) -> list[GoldStandard[Any]]: ... @overload @@ -195,8 +194,7 @@ def get_gold_standards( document_id: Optional[str] = None, question_short_name: Optional[str] = None, experiment: Optional[str] = None, -) -> list[GoldStandard[ParsedGoldStandardType]]: - ... +) -> list[GoldStandard[ParsedGoldStandardType]]: ... def get_gold_standards( @@ -226,8 +224,7 @@ def get_gold_standard( question_short_name: Optional[str] = None, experiment: Optional[str] = None, model_type: None = None, -) -> Optional[GoldStandard[Any]]: - ... +) -> Optional[GoldStandard[Any]]: ... @overload @@ -237,8 +234,7 @@ def get_gold_standard( document_id: Optional[str] = None, question_short_name: Optional[str] = None, experiment: Optional[str] = None, -) -> Optional[GoldStandard[ParsedGoldStandardType]]: - ... +) -> Optional[GoldStandard[ParsedGoldStandardType]]: ... def get_gold_standard( diff --git a/ice/paper.py b/ice/paper.py index 8dcd9337..65526b3e 100644 --- a/ice/paper.py +++ b/ice/paper.py @@ -93,9 +93,9 @@ def parse_txt(file: Path) -> list[dict]: "number": section_title_number(current_section), } ], - "sectionType": "abstract" - if current_section == "Abstract" - else "main", + "sectionType": ( + "abstract" if current_section == "Abstract" else "main" + ), } ) return body diff --git a/ice/recipes/adherence_tfew_paragraph.py b/ice/recipes/adherence_tfew_paragraph.py index 082f1153..45ce85a0 100644 --- a/ice/recipes/adherence_tfew_paragraph.py +++ b/ice/recipes/adherence_tfew_paragraph.py @@ -24,7 +24,6 @@ from ice.utils import map_async from ice.utils import max_by_value - gpt2_tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained("gpt2") diff --git a/ice/recipes/blinding_dynamic.py b/ice/recipes/blinding_dynamic.py index 31160bbe..3fd790e3 100644 --- a/ice/recipes/blinding_dynamic.py +++ b/ice/recipes/blinding_dynamic.py @@ -21,6 +21,7 @@ - routledge-2006.pdf - vittengl-2009.pdf """ + import itertools from typing import Any from typing import Literal @@ -344,9 +345,9 @@ async def run(self, paper: Paper): results_by_intervention: dict[str, dict[Group, dict[str, Any]]] = {} interventions = await self.interventions(paper) for intervention in interventions: - results_by_intervention[ - intervention - ] = await self.blinding_for_intervention(paper, intervention) + results_by_intervention[intervention] = ( + await self.blinding_for_intervention(paper, intervention) + ) recipe_results: list[RecipeResult] = [] for intervention in interventions: diff --git a/ice/recipes/consort_flow/baseline_elicit_answer.py b/ice/recipes/consort_flow/baseline_elicit_answer.py index d46c10c3..e8c4fea2 100644 --- a/ice/recipes/consort_flow/baseline_elicit_answer.py +++ b/ice/recipes/consort_flow/baseline_elicit_answer.py @@ -7,7 +7,6 @@ from ice.apis.openai import openai_complete from ice.recipes.program_search.nodes.answer.types import Demonstration - log = get_logger() diff --git a/ice/recipes/consort_flow/baselines.py b/ice/recipes/consort_flow/baselines.py index 7dcac638..35ff22de 100644 --- a/ice/recipes/consort_flow/baselines.py +++ b/ice/recipes/consort_flow/baselines.py @@ -342,9 +342,11 @@ async def _all_options( except TooLongRequestError: selections = remove_lowest_perplexity(selections) return PaperQaAnswer( - answer=["The question is not answered in the text."] - if do_return_list - else "The question is not answered in the text.", + answer=( + ["The question is not answered in the text."] + if do_return_list + else "The question is not answered in the text." + ), support_candidates=texts, support_labels=[False for text in texts], support_scores=[t[1] for t in texts_with_perplexities], diff --git a/ice/recipes/consort_flow/golds.py b/ice/recipes/consort_flow/golds.py index 0fafae77..5670429a 100644 --- a/ice/recipes/consort_flow/golds.py +++ b/ice/recipes/consort_flow/golds.py @@ -54,9 +54,11 @@ def paper_to_allocation_gold_standards( ( f"The {exp.name} experiment included {len(exp.arms or [])} arms: {', '.join((arm.name for arm in exp.arms or []))}. How many participants were initially allocated to the {arm.name} arm of the {exp.name} experiment?", texts, - arm.allocated.quotes - if arm.allocated and isinstance(arm.allocated, SampleSize) - else [], + ( + arm.allocated.quotes + if arm.allocated and isinstance(arm.allocated, SampleSize) + else [] + ), ) for exp in gs.parsed_answer.experiments for arm in (exp.arms or []) diff --git a/ice/recipes/experiments_and_arms/prompts/can_name_arms.py b/ice/recipes/experiments_and_arms/prompts/can_name_arms.py index 11824dfe..32660d13 100644 --- a/ice/recipes/experiments_and_arms/prompts/can_name_arms.py +++ b/ice/recipes/experiments_and_arms/prompts/can_name_arms.py @@ -13,7 +13,6 @@ from ice.recipes.experiments_and_arms.prompts.utils import start_last_example from ice.recipes.experiments_and_arms.types import MultipartReasoningPrompt - CAN_WE_NAME_ARMS_EXAMPLES: list[ dict[str, Union[ValueTransform[Sequence[str]], str, int]] ] = [ diff --git a/ice/recipes/experiments_and_arms/prompts/cluster.py b/ice/recipes/experiments_and_arms/prompts/cluster.py index c93208a6..759dd7fb 100644 --- a/ice/recipes/experiments_and_arms/prompts/cluster.py +++ b/ice/recipes/experiments_and_arms/prompts/cluster.py @@ -2,7 +2,6 @@ from structlog.stdlib import get_logger - log = get_logger() diff --git a/ice/recipes/experiments_and_arms/prompts/consensus.py b/ice/recipes/experiments_and_arms/prompts/consensus.py index feedc76f..15529eb1 100644 --- a/ice/recipes/experiments_and_arms/prompts/consensus.py +++ b/ice/recipes/experiments_and_arms/prompts/consensus.py @@ -2,7 +2,6 @@ from structlog.stdlib import get_logger - log = get_logger() diff --git a/ice/recipes/experiments_and_arms/recipes/best_passages.py b/ice/recipes/experiments_and_arms/recipes/best_passages.py index eded3d83..0546bd70 100644 --- a/ice/recipes/experiments_and_arms/recipes/best_passages.py +++ b/ice/recipes/experiments_and_arms/recipes/best_passages.py @@ -43,9 +43,11 @@ async def rank_passages_selector( ) # really small non-infinite number closest = min( samples, - key=lambda sample: abs(sample.final_answer - mean_score) - if sample.final_answer - else float("inf"), + key=lambda sample: ( + abs(sample.final_answer - mean_score) + if sample.final_answer + else float("inf") + ), ) return PassageWithReasoning( passage=closest.passage, @@ -154,9 +156,9 @@ async def score( sorted_answers = sorted( answers, - key=lambda prs: prs.final_answer - if prs.final_answer is not None - else float("-inf"), + key=lambda prs: ( + prs.final_answer if prs.final_answer is not None else float("-inf") + ), reverse=True, ) return sorted_answers diff --git a/ice/recipes/experiments_and_arms/recipes/name_experiments.py b/ice/recipes/experiments_and_arms/recipes/name_experiments.py index aa6f7736..086fa294 100644 --- a/ice/recipes/experiments_and_arms/recipes/name_experiments.py +++ b/ice/recipes/experiments_and_arms/recipes/name_experiments.py @@ -156,13 +156,15 @@ async def name_experiments( assert experiment_names.final_answer is not None return ( gs_names, - [ - strip_enumeration_prefix(exp_name) - for exp_name in standardized_answer.split("\n") - if exp_name.strip() - ] - if standardized_answer - else [], + ( + [ + strip_enumeration_prefix(exp_name) + for exp_name in standardized_answer.split("\n") + if exp_name.strip() + ] + if standardized_answer + else [] + ), paragraphs_to_keep, [str(p) for p in paragraphs], ) diff --git a/ice/recipes/experiments_and_arms/types.py b/ice/recipes/experiments_and_arms/types.py index db2886f1..ac5de189 100644 --- a/ice/recipes/experiments_and_arms/types.py +++ b/ice/recipes/experiments_and_arms/types.py @@ -16,7 +16,6 @@ from ice.metrics.gold_standards import ParsedGoldStandardBase - T = TypeVar("T") Stage = NewType("Stage", str) diff --git a/ice/recipes/meta/eval_paper_qa/common_baselines.py b/ice/recipes/meta/eval_paper_qa/common_baselines.py index 3b36d502..e2784647 100644 --- a/ice/recipes/meta/eval_paper_qa/common_baselines.py +++ b/ice/recipes/meta/eval_paper_qa/common_baselines.py @@ -197,9 +197,11 @@ async def preselected_few_shot_qa_baseline( Demonstration( question=g.question, texts=g.gold_support, - answer=g.gold_answer - if isinstance(g.gold_answer, str) - else numbered_list(g.gold_answer).transform(), + answer=( + g.gold_answer + if isinstance(g.gold_answer, str) + else numbered_list(g.gold_answer).transform() + ), ) for g in demonstration_examples ] diff --git a/ice/recipes/meta/eval_paper_qa/paper_qa_baseline.py b/ice/recipes/meta/eval_paper_qa/paper_qa_baseline.py index 16e86a0c..ecb6c6d3 100644 --- a/ice/recipes/meta/eval_paper_qa/paper_qa_baseline.py +++ b/ice/recipes/meta/eval_paper_qa/paper_qa_baseline.py @@ -10,12 +10,10 @@ def make_classification_prompt(paragraph: Paragraph, question: str) -> str: - return F( - f"""Here is a paragraph from a research paper: "{paragraph}" + return F(f"""Here is a paragraph from a research paper: "{paragraph}" Question: Does this paragraph answer the question '{question}'? Say Yes or No. -Answer:""" - ) +Answer:""") async def classify_paragraph(paragraph: Paragraph, question: str) -> float: diff --git a/ice/recipes/meta/eval_paper_qa/qa_baseline.py b/ice/recipes/meta/eval_paper_qa/qa_baseline.py index b2a3a53e..9c7919ce 100644 --- a/ice/recipes/meta/eval_paper_qa/qa_baseline.py +++ b/ice/recipes/meta/eval_paper_qa/qa_baseline.py @@ -2,23 +2,20 @@ from ice.recipe import recipe - DEFAULT_CONTEXT = "We're running a hackathon on 9/9/2022 to decompose complex reasoning tasks into subtasks that are easier to automate & evaluate with language models. Our team is currently breaking down reasoning about the quality of evidence in randomized controlled trials into smaller tasks e.g. placebo, intervention adherence rate, blinding procedure, etc." DEFAULT_QUESTION = "What is happening on 9/9/2022?" def make_qa_prompt(context: str, question: str) -> str: - return F( - f""" + return F(f""" Background text: "{context}" Answer the following question about the background text above: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def answer( diff --git a/ice/recipes/meta/eval_paper_qa/types.py b/ice/recipes/meta/eval_paper_qa/types.py index 8d128d32..672018a8 100644 --- a/ice/recipes/meta/eval_paper_qa/types.py +++ b/ice/recipes/meta/eval_paper_qa/types.py @@ -60,8 +60,7 @@ async def __call__( __paper: Paper, __question: str, __gold_support: Optional[Sequence[str]] = None, - ) -> PaperQaAnswer[AnswerType_contra]: - ... + ) -> PaperQaAnswer[AnswerType_contra]: ... class AnswerEvalMethod(Protocol[AnswerType_contra]): @@ -70,8 +69,7 @@ async def __call__( question: str, ground_truth: AnswerType_contra, prediction: AnswerType_contra, - ) -> tuple[bool, str]: - ... + ) -> tuple[bool, str]: ... class ClassificationEvalMethod(Protocol): @@ -81,5 +79,4 @@ async def __call__( predictions: Sequence[bool], ground_truth: Sequence[str], scores: Optional[Sequence[float]] = None, - ) -> BinaryClassificationMetrics: - ... + ) -> BinaryClassificationMetrics: ... diff --git a/ice/recipes/placebo_dialogs.py b/ice/recipes/placebo_dialogs.py index ce562956..c616d851 100644 --- a/ice/recipes/placebo_dialogs.py +++ b/ice/recipes/placebo_dialogs.py @@ -402,9 +402,11 @@ async def analyze_experiment(self, paper: Paper, experiment: Experiment): experiment=experiment, classifications=[ aggregate_used["answer"], - "Placebo" - if has_placebo_info - else "No placebo or placebo not mentioned", + ( + "Placebo" + if has_placebo_info + else "No placebo or placebo not mentioned" + ), ], answer=placebo_result, result=placebo_result, diff --git a/ice/recipes/placebo_keyword_baseline.py b/ice/recipes/placebo_keyword_baseline.py index d13ab3d5..8c19d94a 100644 --- a/ice/recipes/placebo_keyword_baseline.py +++ b/ice/recipes/placebo_keyword_baseline.py @@ -54,9 +54,11 @@ async def run(self, paper: Paper): result=f"{placebo_answer.classification}: {placebo_answer.sentence}", answer=f"{placebo_answer.classification}: {placebo_answer.sentence}", classifications=[ - "Placebo" - if placebo_answer.classification == "Placebo" - else "No placebo or placebo not mentioned", + ( + "Placebo" + if placebo_answer.classification == "Placebo" + else "No placebo or placebo not mentioned" + ), placebo_answer.classification, ], excerpts=[placebo_answer.sentence], diff --git a/ice/recipes/primer/amplify.py b/ice/recipes/primer/amplify.py index 4014b30b..3afa75ef 100644 --- a/ice/recipes/primer/amplify.py +++ b/ice/recipes/primer/amplify.py @@ -4,7 +4,6 @@ from ice.recipes.primer.subquestions import ask_subquestions from ice.utils import map_async - Question = str Answer = str Subs = list[tuple[Question, Answer]] diff --git a/ice/recipes/primer/answer_by_computation.py b/ice/recipes/primer/answer_by_computation.py index 84f62ce0..4d926527 100644 --- a/ice/recipes/primer/answer_by_computation.py +++ b/ice/recipes/primer/answer_by_computation.py @@ -4,19 +4,16 @@ def make_computation_choice_prompt(question: str) -> str: - return F( - f"""You've been asked to answer the question "{question}". + return F(f"""You've been asked to answer the question "{question}". You have access to a Python interpreter. Enter an expression that will help you answer the question. ->>>""" - ) +>>>""") def make_compute_qa_prompt(question: str, expression: str, result: str) -> str: - return F( - f"""A recording of a Python interpreter session: + return F(f"""A recording of a Python interpreter session: >>> {expression}: {result} @@ -24,8 +21,7 @@ def make_compute_qa_prompt(question: str, expression: str, result: str) -> str: Question: "{question}" Answer: " -""" - ).strip() +""").strip() def eval_python(expression: str) -> str: diff --git a/ice/recipes/primer/answer_by_dispatch/prompt.py b/ice/recipes/primer/answer_by_dispatch/prompt.py index 2c5bdf80..7454a562 100644 --- a/ice/recipes/primer/answer_by_dispatch/prompt.py +++ b/ice/recipes/primer/answer_by_dispatch/prompt.py @@ -11,13 +11,11 @@ def make_action_selection_prompt(question: str) -> str: ] ) - return F( - f"""You want to answer the question "{question}". + return F(f"""You want to answer the question "{question}". You have the following options: {action_types_str} Q: Which of these options do you want to use before you answer the question? Choose the option that will most help you give an accurate answer. -A: I want to use option #""" - ).strip() +A: I want to use option #""").strip() diff --git a/ice/recipes/primer/answer_by_dispatch/types.py b/ice/recipes/primer/answer_by_dispatch/types.py index 2277cb2e..d1abe9fa 100644 --- a/ice/recipes/primer/answer_by_dispatch/types.py +++ b/ice/recipes/primer/answer_by_dispatch/types.py @@ -7,8 +7,7 @@ class QuestionRecipe(Protocol): - async def __call__(self, question: str) -> str: - ... + async def __call__(self, question: str) -> str: ... @dataclass diff --git a/ice/recipes/primer/answer_by_reasoning.py b/ice/recipes/primer/answer_by_reasoning.py index 6a793185..0486a3e9 100644 --- a/ice/recipes/primer/answer_by_reasoning.py +++ b/ice/recipes/primer/answer_by_reasoning.py @@ -4,24 +4,20 @@ def generate_reasoning_prompt(question: str) -> str: - return F( - f"""Answer the following question: + return F(f"""Answer the following question: Question: "{question}" Answer: "Let's think step by step. -""" - ).strip() +""").strip() def generate_answer_prompt(question: str, reasoning: str) -> str: - return F( - f"""Answer the following question using the reasoning shown below: + return F(f"""Answer the following question using the reasoning shown below: Question: "{question}" Reasoning: "{reasoning}" Short answer: " -""" - ).strip() +""").strip() async def get_reasoning(question: str) -> str: diff --git a/ice/recipes/primer/answer_by_search.py b/ice/recipes/primer/answer_by_search.py index c2fa03a4..30ca283b 100644 --- a/ice/recipes/primer/answer_by_search.py +++ b/ice/recipes/primer/answer_by_search.py @@ -5,26 +5,22 @@ def make_search_result_prompt(context: str, query: str, question: str) -> str: - return F( - f""" + return F(f""" Search results from Google for the query "{query}": "{context}" Answer the following question, using the search results if helpful: Question: "{question}" Answer: " -""" - ).strip() +""").strip() def make_search_query_prompt(question: str) -> str: - return F( - f""" + return F(f""" You're trying to answer the question {question}. You get to type in a search query to Google, and then you'll be shown the results. What query do you want to search for? Query: " -""" - ).strip('" ') +""").strip('" ') async def search(query: str) -> dict: diff --git a/ice/recipes/primer/answer_by_search_direct.py b/ice/recipes/primer/answer_by_search_direct.py index 406aba41..a41ced0c 100644 --- a/ice/recipes/primer/answer_by_search_direct.py +++ b/ice/recipes/primer/answer_by_search_direct.py @@ -5,16 +5,14 @@ def make_search_result_prompt(context: str, question: str) -> str: - return F( - f""" + return F(f""" Search results from Google: "{context}" Answer the following question, using the search results if helpful: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def search(query: str) -> dict: diff --git a/ice/recipes/primer/chain_of_thought.py b/ice/recipes/primer/chain_of_thought.py index b2cd2ba8..f6ca19c9 100644 --- a/ice/recipes/primer/chain_of_thought.py +++ b/ice/recipes/primer/chain_of_thought.py @@ -4,13 +4,11 @@ def make_chain_of_thought_prompt(question: str, answer_prefix: str = "") -> str: - return F( - f"""Answer the following question: + return F(f"""Answer the following question: Question: "{question}" Answer: "{answer_prefix} -""" - ).strip() +""").strip() async def chain_of_thought( diff --git a/ice/recipes/primer/debate/prompt.py b/ice/recipes/primer/debate/prompt.py index f00d9c56..9aae7778 100644 --- a/ice/recipes/primer/debate/prompt.py +++ b/ice/recipes/primer/debate/prompt.py @@ -4,12 +4,10 @@ def render_debate_prompt(agent_name: str, debate: Debate, turns_left: int) -> str: - prompt = F( - f""" + prompt = F(f""" You are {agent_name}. There are {turns_left} turns left in the debate. You are trying to win the debate using reason and evidence. Don't repeat yourself. No more than 1-2 sentences per turn. {render_debate(debate, agent_name)} You: " -""" - ).strip() +""").strip() return prompt diff --git a/ice/recipes/primer/eval_selective.py b/ice/recipes/primer/eval_selective.py index d266e0fa..6a3ab18e 100644 --- a/ice/recipes/primer/eval_selective.py +++ b/ice/recipes/primer/eval_selective.py @@ -4,14 +4,12 @@ def make_computation_choice_prompt(question: str) -> str: - return F( - f"""You've been asked to answer the question "{question}". + return F(f"""You've been asked to answer the question "{question}". You have access to a Python interpreter. Enter an expression that will help you answer the question. ->>>""" - ) +>>>""") def eval_python(expression: str) -> str: diff --git a/ice/recipes/primer/paper_qa.py b/ice/recipes/primer/paper_qa.py index edc999c8..0a6a89f7 100644 --- a/ice/recipes/primer/paper_qa.py +++ b/ice/recipes/primer/paper_qa.py @@ -8,12 +8,10 @@ def make_classification_prompt(paragraph: Paragraph, question: str) -> str: - return F( - f"""Here is a paragraph from a research paper: "{paragraph}" + return F(f"""Here is a paragraph from a research paper: "{paragraph}" Question: Does this paragraph answer the question '{question}'? Say Yes or No. -Answer:""" - ) +Answer:""") async def classify_paragraph(paragraph: Paragraph, question: str) -> float: diff --git a/ice/recipes/primer/paper_qa_class.py b/ice/recipes/primer/paper_qa_class.py index d86a8b7b..e044998b 100644 --- a/ice/recipes/primer/paper_qa_class.py +++ b/ice/recipes/primer/paper_qa_class.py @@ -6,13 +6,11 @@ def make_prompt(paragraph: Paragraph, question: str) -> str: - return F( - f""" + return F(f""" Here is a paragraph from a research paper: "{paragraph}" Question: Does this paragraph answer the question '{question}'? Say Yes or No. -Answer:""" - ).strip() +Answer:""").strip() async def classify_paragraph(paragraph: Paragraph, question: str) -> float: diff --git a/ice/recipes/primer/paper_qa_classes.py b/ice/recipes/primer/paper_qa_classes.py index 7755aad3..c6e2d292 100644 --- a/ice/recipes/primer/paper_qa_classes.py +++ b/ice/recipes/primer/paper_qa_classes.py @@ -7,12 +7,10 @@ def make_prompt(paragraph: Paragraph, question: str) -> str: - return F( - f"""Here is a paragraph from a research paper: "{paragraph}" + return F(f"""Here is a paragraph from a research paper: "{paragraph}" Question: Does this paragraph answer the question '{question}'? Say Yes or No. -Answer:""" - ) +Answer:""") async def classify_paragraph(paragraph: Paragraph, question: str) -> float: diff --git a/ice/recipes/primer/paper_qa_ranker.py b/ice/recipes/primer/paper_qa_ranker.py index 7b1d5b8e..e9abb14a 100644 --- a/ice/recipes/primer/paper_qa_ranker.py +++ b/ice/recipes/primer/paper_qa_ranker.py @@ -7,12 +7,10 @@ def make_classification_prompt(paragraph: Paragraph, question: str) -> str: - return F( - f"""Here is a paragraph from a research paper: "{paragraph}" + return F(f"""Here is a paragraph from a research paper: "{paragraph}" Question: Does this paragraph answer the question '{question}'? Say Yes or No. -Answer:""" - ) +Answer:""") async def classify_paragraph(paragraph: Paragraph, question: str) -> float: diff --git a/ice/recipes/primer/qa.py b/ice/recipes/primer/qa.py index 7099ff69..30e42bfd 100644 --- a/ice/recipes/primer/qa.py +++ b/ice/recipes/primer/qa.py @@ -2,23 +2,20 @@ from ice.recipe import recipe - DEFAULT_CONTEXT = "We're running a hackathon on 9/9/2022 to decompose complex reasoning tasks into subtasks that are easier to automate & evaluate with language models. Our team is currently breaking down reasoning about the quality of evidence in randomized controlled trials into smaller tasks e.g. placebo, intervention adherence rate, blinding procedure, etc." DEFAULT_QUESTION = "What is happening on 9/9/2022?" def make_qa_prompt(context: str, question: str) -> str: - return F( - f""" + return F(f""" Background text: "{context}" Answer the following question about the background text above: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def answer( diff --git a/ice/recipes/primer/qa_simple.py b/ice/recipes/primer/qa_simple.py index 2950b71a..55b1a5ad 100644 --- a/ice/recipes/primer/qa_simple.py +++ b/ice/recipes/primer/qa_simple.py @@ -4,13 +4,11 @@ def make_qa_prompt(question: str) -> str: - return F( - f"""Answer the following question: + return F(f"""Answer the following question: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def answer(question: str = "What is happening on 9/9/2022?"): diff --git a/ice/recipes/primer/search_json.py b/ice/recipes/primer/search_json.py index 734bd741..b77461e3 100644 --- a/ice/recipes/primer/search_json.py +++ b/ice/recipes/primer/search_json.py @@ -5,16 +5,14 @@ def make_qa_prompt(context: str, question: str) -> str: - return F( - f""" + return F(f""" Background text: "{context}" Answer the following question about the background text above: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def search(query: str = "Who is the president of the United States?") -> dict: diff --git a/ice/recipes/primer/sequential_action.py b/ice/recipes/primer/sequential_action.py index 50b8a840..a8d994de 100644 --- a/ice/recipes/primer/sequential_action.py +++ b/ice/recipes/primer/sequential_action.py @@ -10,7 +10,6 @@ from ice.recipe import recipe from ice.recipes.primer.search_string import search_string - Log = list[str] @@ -23,13 +22,11 @@ def render_context(question: str, log: Log) -> str: question_context = F(f'The question you want to answer: "{question}"') if not log: return question_context - return F( - f"""{question_context} + return F(f"""{question_context} What you've done so far: -{render_enumerate(log)}""" - ) +{render_enumerate(log)}""") def render_action_context(question: str, log: Log, max_actions: int) -> str: @@ -40,29 +37,23 @@ def render_action_context(question: str, log: Log, max_actions: int) -> str: f"You have {max_actions} actions left. (The one you're taking right now, and {max_actions - 1} follow-up actions.)" ) ) - return F( - f"""{render_context(question, log)} + return F(f"""{render_context(question, log)} -{action_count_text}""" - ) +{action_count_text}""") def make_knowledge_prompt(question: str, log: Log) -> str: - return F( - f"""{render_context(question, log)} + return F(f"""{render_context(question, log)} Q: Do you have enough information to correctly answer the question? Say "A: Yes" or "A: No" -A:""" - ) +A:""") def make_answer_prompt(question: str, log: Log) -> str: - return F( - f"""{render_context(question, log)} + return F(f"""{render_context(question, log)} Q: {question} -A:""" - ) +A:""") async def is_info_sufficient(question: str, log: Log) -> bool: @@ -82,16 +73,13 @@ async def answer_directly(question: str, log: Log) -> str: class Action(ABC): @classmethod @abstractmethod - async def propose(cls, question: str, log: Log, max_actions: int) -> "Action": - ... + async def propose(cls, question: str, log: Log, max_actions: int) -> "Action": ... @abstractmethod - def run(self): - ... + def run(self): ... @abstractmethod - def make_log_entry(self, result: str) -> str: - ... + def make_log_entry(self, result: str) -> str: ... @dataclass @@ -100,16 +88,14 @@ class CalculationAction(Action): @classmethod def make_proposal_prompt(cls, question: str, log: Log, max_actions: int) -> str: - return F( - f"""{render_action_context(question, log, max_actions)} + return F(f"""{render_action_context(question, log, max_actions)} You have chosen to take the action "Do a calculation". You have access to a Python interpreter. What single-line calculation would most help you answer the question "{question}"? >>> import math ->>>""" - ) +>>>""") @classmethod async def propose( @@ -140,15 +126,13 @@ class WebSearchAction(Action): @classmethod def make_proposal_prompt(cls, question: str, log: Log, max_actions: int) -> str: - return F( - f"""{render_action_context(question, log, max_actions)} + return F(f"""{render_action_context(question, log, max_actions)} You have chosen to take the action "Run a web search". What is a first web search query you could run to help you answer the question "{question}"? -Query:""" - ) +Query:""") @classmethod async def propose( @@ -195,16 +179,14 @@ def make_action_choice_prompt( else F(f", and {max_actions - 1} similar follow-up actions") ) - return F( - f"""{render_context(question, log)} + return F(f"""{render_context(question, log)} You can take one of the following actions now{follow_up_text} before you need to answer: {render_enumerate(actions)} Question: What next action should you take to make progress on answering the question "{question}"? {render_numbers(len(actions))}? -Answer:""" - ) +Answer:""") async def choose_action( diff --git a/ice/recipes/primer/subquestions_answered.py b/ice/recipes/primer/subquestions_answered.py index 99840017..00caf9e4 100644 --- a/ice/recipes/primer/subquestions_answered.py +++ b/ice/recipes/primer/subquestions_answered.py @@ -6,13 +6,11 @@ def make_qa_prompt(question: str) -> str: - return F( - f"""Answer the following question: + return F(f"""Answer the following question: Question: "{question}" Answer: " -""" - ).strip() +""").strip() async def answer(question: str) -> str: diff --git a/ice/recipes/primer/verify/last.py b/ice/recipes/primer/verify/last.py index 52aa78c2..e266ad5c 100644 --- a/ice/recipes/primer/verify/last.py +++ b/ice/recipes/primer/verify/last.py @@ -5,16 +5,14 @@ def make_verification_prompt(question: str, steps: list[str]) -> str: - return F( - f"""Consider this question: "{question}" + return F(f"""Consider this question: "{question}" Here are the first few steps of an answer: {render_steps(steps)} Q: Is step {len(steps)} correct, assuming that the previous steps are correct? Say "A: Yes" or "A: No". -A:""" - ) +A:""") async def check_step(question: str, steps: list[str]) -> float: diff --git a/ice/recipes/primer/verify_answer.py b/ice/recipes/primer/verify_answer.py index 7cc8ebde..7f225db0 100644 --- a/ice/recipes/primer/verify_answer.py +++ b/ice/recipes/primer/verify_answer.py @@ -4,14 +4,12 @@ def make_verification_prompt(question: str, answer: str) -> str: - return F( - f"""Consider this question: "{question}" + return F(f"""Consider this question: "{question}" Potential answer: "{answer}" Q: Is the potential answer above correct? Say "A: Yes" or "A: No". -A:""" - ) +A:""") async def verify_answer(question: str, answer: str) -> float: diff --git a/ice/recipes/program_search/nodes/answer/generate_reasoning/prompts.py b/ice/recipes/program_search/nodes/answer/generate_reasoning/prompts.py index 6b1508b9..62488b3e 100644 --- a/ice/recipes/program_search/nodes/answer/generate_reasoning/prompts.py +++ b/ice/recipes/program_search/nodes/answer/generate_reasoning/prompts.py @@ -7,7 +7,6 @@ from ice.recipes.program_search.nodes.answer.types import Demonstration from ice.recipes.program_search.nodes.answer.types import DemonstrationWithReasoning - INSTRUCTIONS = "My friend came up with the following correct answers for each question but didn't write out his reasoning that supports each answer. Can you write a paragraph explaining why each answer is correct? Be sure to quote the parts of the text that support the answer, explaining why this is the correct conclusion." DEMONSTRATION_EXAMPLE = """ diff --git a/ice/recipes/program_search/nodes/decontext/prompts.py b/ice/recipes/program_search/nodes/decontext/prompts.py index 6ac8d7d6..6aab54c9 100644 --- a/ice/recipes/program_search/nodes/decontext/prompts.py +++ b/ice/recipes/program_search/nodes/decontext/prompts.py @@ -83,12 +83,16 @@ def decontext_prompt( if questions: last_example["questions"] = numbered_list(questions) examples = format_multi( - QUESTION_GUIDED_EXAMPLE_TEMPLATE - if questions - else QUESTION_FREE_EXAMPLE_TEMPLATE, - QUESTION_GUIDED_EXAMPLES + [last_example] - if questions - else QUESTION_FREE_EXAMPLES + [last_example], + ( + QUESTION_GUIDED_EXAMPLE_TEMPLATE + if questions + else QUESTION_FREE_EXAMPLE_TEMPLATE + ), + ( + QUESTION_GUIDED_EXAMPLES + [last_example] + if questions + else QUESTION_FREE_EXAMPLES + [last_example] + ), ) return "\n\n".join( ( diff --git a/ice/recipes/program_search/nodes/select/prompts.py b/ice/recipes/program_search/nodes/select/prompts.py index e2c0dad1..3459dfce 100644 --- a/ice/recipes/program_search/nodes/select/prompts.py +++ b/ice/recipes/program_search/nodes/select/prompts.py @@ -114,9 +114,11 @@ def render_selection_example( question=question, existing=numbered_list(example.existing) if example.existing else NO_EXISTING, texts=numbered_list([str(text) for text in example.selection]), - selections=NONE_ANSWER - if not example.positive_idxs - else str(example.positive_idxs[0] + 1), + selections=( + NONE_ANSWER + if not example.positive_idxs + else str(example.positive_idxs[0] + 1) + ), NONE_ANSWER=NONE_ANSWER, ) diff --git a/ice/recipes/program_search/nodes/select/select.py b/ice/recipes/program_search/nodes/select/select.py index 7107cf5a..5fc77101 100644 --- a/ice/recipes/program_search/nodes/select/select.py +++ b/ice/recipes/program_search/nodes/select/select.py @@ -305,9 +305,11 @@ async def select_using_elicit_prompt_few_shot( if include_negative: demonstrations_or_none = [ - (await elicit_negative_few_shot_example(example, max_examples=1)) - if idx % 3 == 0 # more positive than negative examples - else (await positive_few_shot_example(example, max_examples=1)) + ( + (await elicit_negative_few_shot_example(example, max_examples=1)) + if idx % 3 == 0 # more positive than negative examples + else (await positive_few_shot_example(example, max_examples=1)) + ) for idx, example in enumerate(examples) ] else: diff --git a/ice/recipes/program_search/types.py b/ice/recipes/program_search/types.py index 928251ba..1d1fe3ec 100644 --- a/ice/recipes/program_search/types.py +++ b/ice/recipes/program_search/types.py @@ -75,8 +75,7 @@ class Trace(BaseModel): T = t.TypeVar("T") -class Beam(GenericModel, t.Generic[T]): - ... +class Beam(GenericModel, t.Generic[T]): ... def remove_lowest_perplexity(results: t.Sequence[tuple[str, float]]): diff --git a/ice/recipes/program_search/utils/find_examples.py b/ice/recipes/program_search/utils/find_examples.py index 193e5bf2..abc0def3 100644 --- a/ice/recipes/program_search/utils/find_examples.py +++ b/ice/recipes/program_search/utils/find_examples.py @@ -77,9 +77,9 @@ async def rouge_distractor_scores( scores = await rouge_texts(hypotheses=hypotheses, references=references) return { text: ( - lambda s: s.rouge_l.r - if s.rouge_l.r < lcs_threshold and s.rouge_3.r == 0 - else 0 + lambda s: ( + s.rouge_l.r if s.rouge_l.r < lcs_threshold and s.rouge_3.r == 0 else 0 + ) )(score) for text, score in scores.items() } diff --git a/ice/trace.py b/ice/trace.py index 33a28a29..d9cb8b2e 100644 --- a/ice/trace.py +++ b/ice/trace.py @@ -309,8 +309,7 @@ def __new__(mcls, name, bases, namespace): ) -class TracedABC(metaclass=TracedABCMeta): - ... +class TracedABC(metaclass=TracedABCMeta): ... # TODO this and the functions it calls needs to be replaced with a better system diff --git a/main.py b/main.py index f1283fd9..b23e5308 100644 --- a/main.py +++ b/main.py @@ -20,7 +20,6 @@ from ice.trace import trace from ice.utils import map_async - log = get_logger() diff --git a/tests/test_main.py b/tests/test_main.py index 85efd8e3..27bde091 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -8,7 +8,6 @@ from ice.recipes import get_recipe_classes from main import main_cli - nest_asyncio.apply() diff --git a/tests/test_trace.py b/tests/test_trace.py index efdf5c13..748185ba 100644 --- a/tests/test_trace.py +++ b/tests/test_trace.py @@ -88,10 +88,8 @@ def test_get_strings(): # Nested lists are not flattened assert trace.get_strings([["a", "b"], ["c", "d"]]) == ["a", "b"] - assert trace.get_strings(pd.DataFrame({"a": [1, 2, 3]})) == [ - """\ + assert trace.get_strings(pd.DataFrame({"a": [1, 2, 3]})) == ["""\ a 0 1 1 2 -2 3""" - ] +2 3"""]