From 250dbb8eaad9ad1a757f6f692b351912d831c84d Mon Sep 17 00:00:00 2001 From: "Marcus D. Collins" Date: Wed, 25 Mar 2026 11:58:50 -0700 Subject: [PATCH 1/3] feat:add tortoize (bb and sc dihedral z-score) script --- EVALUATION.md | 0 scripts/eval/run_and_process_tortoize.py | 149 ++++++++++++++++++ ...cif_files.py => patch_output_cif_files.py} | 0 3 files changed, 149 insertions(+) create mode 100644 EVALUATION.md create mode 100644 scripts/eval/run_and_process_tortoize.py rename scripts/{patch_input_cif_files.py => patch_output_cif_files.py} (100%) diff --git a/EVALUATION.md b/EVALUATION.md new file mode 100644 index 00000000..e69de29b diff --git a/scripts/eval/run_and_process_tortoize.py b/scripts/eval/run_and_process_tortoize.py new file mode 100644 index 00000000..d8edc4f7 --- /dev/null +++ b/scripts/eval/run_and_process_tortoize.py @@ -0,0 +1,149 @@ +import json +import subprocess +from pathlib import Path +from typing import Any + +import joblib +import pandas as pd +from loguru import logger +from pandas import DataFrame + +from sampleworks.eval.eval_dataclasses import Trial +from sampleworks.eval.grid_search_eval_utils import parse_eval_args, setup_evaluation_parameters + + +# TODO make more general: https://github.com/diff-use/sampleworks/issues/93 +def main(args) -> None: + # check that phenix is installed and available, bail early if not. + try: + subprocess.call("tortoize", stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except FileNotFoundError: + raise RuntimeError( + "tortoize is not available, make sure you have installed it." + ) + # The dropped variable is a list of ProteinConfigs, not used yet in this script + all_trials, _ = setup_evaluation_parameters(args) + + # Now loop over trials with joblib and get back tuples of trial level metrics + tortoize_results = joblib.Parallel(n_jobs=args.n_jobs)( + joblib.delayed(get_stats_for_single_path)(trial.refined_cif_path) for trial in all_trials + ) + if not tortoize_results: + logger.error("No trials successfully processed, check that result files are available.") + return + + all_residue_results, all_protein_results = tuple(zip(*tortoize_results, strict=True)) + + output_file = f"tortoize_residues.csv" + pd.concat(all_residue_results).to_csv(output_file, index=False) + logger.info(f"Residue results saved to {output_file}") + + output_file = f"tortoize_protein_stats.csv" + pd.concat(all_protein_results).to_csv(output_file, index=False) + logger.info(f"Protein-level stats saved to {output_file}") + + +def flatten_residues(tortoize_json: dict[str, Any]) -> pd.DataFrame: + """ + Flattens tortoize JSON into one dict per residue across all models. + See test/1cbs.json for an example of the tortoize output JSON + + Output keys per residue: + - model (e.g. "1") + - asymID + - compID + - seqID + - ss_type (taken from ramachandran["ss-type"] if present, else torsion["ss-type"]) + - ramachandran_z_score (ramachandran["z-score"]) + - torsion_z_score (torsion["z-score"]) + """ + out: list[dict[str, Any]] = [] + + # This is possibly over-robust. Claude Sonnet wrote it and I checked/modified it. + model_block = tortoize_json.get("model", {}) + for model_id, model_data in model_block.items(): + residues = (model_data or {}).get("residues", []) + if not residues: + logger.warning(f"No residues found in model {model_id}") + continue + for r in residues: + rama = (r or {}).get("ramachandran", {}) + tors = (r or {}).get("torsion", {}) + pdb_info = (r or {}).get("pdb", {}) + + ss_type: str | None = rama.get("ss-type", None) + if ss_type is None: + ss_type = tors.get("ss-type", None) + + out.append( + { + "model": str(model_id), + "asymID": r.get("asymID", None), + "compID": r.get("compID", None), + "seqID": r.get("seqID", None), + "strandID": pdb_info.get("strandID", None), + "insCode": pdb_info.get("insCode", None), + "ss_type": ss_type, + "ramachandran_z_score": rama.get("z-score", None), + "torsion_z_score": tors.get("z-score", None), + } + ) + + return pd.DataFrame(out) + + +def get_protein_level_z_scores(tortoize_json: dict[str, Any]) -> pd.DataFrame: + """ + Extracts protein-level z-scores for torsion and Ramachandran angles from tortoize JSON output. + See test/1cbs.json for an example of the tortoize output JSON + + + :param tortoize_json: + :return: pd.DataFrame with keys: + - model (e.g. "1") + - ramachandran_z_score + - torsion_z_score + - ramachandran_jackknife_sd + - torsion_jackknife_sd + - residue_count + """ + + out: list[dict[str, Any]] = [] + model_block = tortoize_json.get("model", {}) + for model_id, model_data in model_block.items(): + out.append({ + "model": str(model_id), + "ramachandran_z_score": model_data.get("ramachandran-z", None), + "ramachandran_jackknife_sd": model_data.get("ramachandran-jackknife-sd", None), + "torsion_z_score": model_data.get("torsion-z", None), + "torsion_jackknife_sd": model_data.get("torsion-jackknife-sd", None) + }) + return pd.DataFrame(out) + + +def get_stats_for_single_path(path: Path) -> tuple[DataFrame, DataFrame]: + logger.info(f"Processing {path}") + try: + output = subprocess.check_output(f"tortoize {path}".split()) + result = json.loads(output) + except Exception as e: + logger.error(f"Failed to process {path}: {e}") + return pd.DataFrame(), pd.DataFrame() + + residues = flatten_residues(result) + if residues.empty: + logger.warning(f"No residues found in {path}") + return pd.DataFrame(), pd.DataFrame() + + residues["path"] = path + + protein_level_stats = get_protein_level_z_scores(result) + protein_level_stats["path"] = path + return residues, protein_level_stats + + +if __name__ == "__main__": + argparse_description = "Crawl the workspace root for CIF files matching " + argparse_description += "--target-filename and run tortoize on them." + eval_args = parse_eval_args(description=argparse_description) + main(eval_args) diff --git a/scripts/patch_input_cif_files.py b/scripts/patch_output_cif_files.py similarity index 100% rename from scripts/patch_input_cif_files.py rename to scripts/patch_output_cif_files.py From 3c087f1085629866b9ed6bff0804c065c5f38436 Mon Sep 17 00:00:00 2001 From: "Marcus D. Collins" Date: Wed, 25 Mar 2026 12:00:06 -0700 Subject: [PATCH 2/3] fix:add documentation of evaluation scripts, resolves https://github.com/diff-use/sampleworks/issues/123 --- EVALUATION.md | 188 +++++++++++++++++++++++ scripts/eval/run_and_process_tortoize.py | 4 +- 2 files changed, 190 insertions(+), 2 deletions(-) diff --git a/EVALUATION.md b/EVALUATION.md index e69de29b..efa77eed 100644 --- a/EVALUATION.md +++ b/EVALUATION.md @@ -0,0 +1,188 @@ +# Evaluation of SampleWorks Grid Search Results + +# External software requirements +## tortoize +SampleWorks relies on tortoize to compute backbone and sidechain dihedral angle outliers. +`tortoize` is free software and can be downloaded from https://github.com/PDB-REDO/tortoize. +You should install it following their instructions and make sure it is available in the environment +where you run SampleWorks. The script scripts/eval/run_and_process_tortoize.py will check for the +`tortoize` executable before running and will raise an error if it is not available. + +## phenix +Information about the phenix package can be found at https://phenix-online.org/. Phenix requires a +license which is free to academic users. Others may have to pay a fee. Sampleworks makes use of the +phenix.clashscore command and `run_and_process_phenix_clashscore.py` will check for it before +running, raising an error if it is not available. + +# Running the evaluations +## Preparing the output CIF files +As of this writing, Sampleworks outputs CIF files that primarily contain the output atomic +coordinates, and not the additional information that many programs, like `tortoize` and +`phenix.clashscore`, require. Furthermore, many protein structure predictors effectively +renumber residues. Since our metrics are frequently calculated by comparing selections of atoms or +residues, we must align to the original _sequence_ of the protein as well. Future versions of +Sampleworks will handle these issues automatically. For now, you should run the script +`scripts/patch_output_cif_files.py`. This will use the original PDB inputs to reconstruct proper +output CIF files that are numbered correctly and +have all necessary metadata to reconstruct the protein structure correctly. + +You can run the following command, which assumes: +- your sampleworks output is stored in `/home/ubuntu/grid_search_results`, +- the output is organized by RCSB PDB ID in directories like `/home/ubuntu/grid_search_results/1VME/...`, + see the `--rcsb-pattern` argument which is a regex to match the RCSB PDB ID +- the input PDB cif files are stored in `/home/ubuntu/grid_search_inputs` as required for running the + the grid search (see GRID_SEARCH.md) +- the input PDB cif files are stored in `/home/ubuntu/grid_search_inputs` as, e.g., + `/home/ubuntu/grid_search_inputs/1VME/1VME_original.cif`, see the `--input-pdb-pattern` argument, which + is a python format string which must use the `pdb_id` variable to refer to the RCSB PDB ID. + +```shell +pixi run -e analysis python scripts/patch_output_cif_files.py \ + --input-dir /home/ubuntu/grid_search_results \ + --rcsb-pattern 'grid_search_results/(.{4})/...' \ + --cif-pattern 'refined.cif' \ + --grid-search-input-dir /home/ubuntu/grid_search_inputs \ + --input-pdb-pattern '{pdb_id}/{pdb_id}_original.cif' +``` + +This script searches recursively for all CIF files under the input directory, by default up to 4 +levels deep. If you organize the output more deeply, you can specify the depth with the `--depth` +argument. It will output a patched CIF files named `refined-patched.cif` along each original `refined.cif` +file. These `refined-patched.cif` files can be used as input to the remaining evaluation scripts. + +## Running the scripts +The evaluation scripts have a common interface defined by the method +`sampleworks.eval.grid_search_eval_utils.parse_eval_args`. The general form of these commands is: + +```shell +pixi run -e analysis python scripts/eval/