From 82fe1678eb8ad525a9c8bd8533eee16de474607f Mon Sep 17 00:00:00 2001 From: xraymemory Date: Thu, 28 May 2026 12:36:47 -0400 Subject: [PATCH 1/4] feat(runs): add analysis preset harness --- Dockerfile | 8 +- README.md | 26 ++- analyses/all.toml | 56 +++++ analyses/altloc_classify.toml | 25 ++ analyses/altloc_find.toml | 23 ++ analyses/external_tools.toml | 32 +++ analyses/grid_search.toml | 40 ++++ docker-entrypoint.sh | 19 +- pyproject.toml | 2 + run_analysis | 338 +++++++++++++++++++++++++++ run_analysis.sh | 17 ++ scripts/eval/EVALUATION.md | 17 ++ src/sampleworks/runs/__init__.py | 6 +- src/sampleworks/runs/analysis_cli.py | 51 ++++ src/sampleworks/runs/cli.py | 204 +++++++++++----- src/sampleworks/runs/loader.py | 124 +++++++--- src/sampleworks/runs/runner.py | 154 ++++++++++-- src/sampleworks/runs/schema.py | 49 +++- tests/runs/test_runner.py | 82 ++++++- 19 files changed, 1142 insertions(+), 131 deletions(-) create mode 100644 analyses/all.toml create mode 100644 analyses/altloc_classify.toml create mode 100644 analyses/altloc_find.toml create mode 100644 analyses/external_tools.toml create mode 100644 analyses/grid_search.toml create mode 100755 run_analysis create mode 100755 run_analysis.sh create mode 100644 src/sampleworks/runs/analysis_cli.py diff --git a/Dockerfile b/Dockerfile index 439421e3..0d96eaac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -120,7 +120,8 @@ COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints # smaller CI runners (ubuntu-latest can be 72 GB or 145 GB). 
RUN pixi install -e boltz --frozen && \ pixi install -e protenix --frozen && \ - pixi install -e rf3 --frozen + pixi install -e rf3 --frozen && \ + pixi install -e analysis --frozen # ============================================================================ # Pre-compile CUDA extensions to avoid JIT compilation at runtime @@ -133,10 +134,11 @@ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-comp # This image carries pixi environments and checkpoints. Runtime source should # come from ACTL's synced checkout at /home/dev/workspace, not from stale code # baked into /app during image construction. -RUN rm -rf /app/src /app/scripts /app/experiments /app/run_grid_search.py \ +RUN rm -rf /app/src /app/scripts /app/experiments /app/analyses \ + /app/run_grid_search.py /app/run_analysis \ && mkdir -p /home/dev/workspace -COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/ +COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh run_analysis run_analysis.sh /usr/local/bin/ RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n cd /home/dev/workspace\nfi\n' >> /root/.bashrc ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app \ diff --git a/README.md b/README.md index f9f31aa8..a069a7c0 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,8 @@ Output layout: `grid_search_results//[_]//ens > **Note**: Jobs are skipped if a `refined.cif` file already exists in the output directory. Some flags (e.g., `--use-tweedie`, `--gradient-normalization`) are not reflected in the directory structure, so changing them alone won't trigger a re-run. Use `--force-all` to re-run all jobs regardless. This is under active development and will likely change soon. -Instructions for running evaluation and metrics scripts are coming soon. 
+Evaluation and metrics scripts can be run through `run_analysis`; see the ACTL +section below and `scripts/eval/EVALUATION.md`. ## Running preset experiments on ACTL (`run_experiments`) @@ -246,6 +247,29 @@ debugging only, opt into an on-pod pixi update with rebuilt `pixi-with-checkpoints:sampleworks` image instead. +## Running preset analyses on ACTL (`run_analysis`) + +`run_analysis` uses the same TOML runner as `run_experiments`, but loads presets +from `analyses/*.toml` and runs the scripts under `scripts/eval/`. + +```bash +export GRID_SEARCH_RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks/<run-name>/full_8gpu +export GRID_SEARCH_INPUTS_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps +export PROTEIN_CONFIGS_CSV="$GRID_SEARCH_INPUTS_DIR/protein_analysis_config.csv" + +run_analysis --list +run_analysis --dry-run rscc +run_analysis grid_search --jobs rscc,lddt +run_analysis altloc_find +run_analysis altloc_classify +run_analysis all # includes tortoize and phenix.clashscore jobs +``` + +Use `--set` for one-off changes, for example +`run_analysis rscc --set shared_args.target-filename=refined.cif` or +`run_analysis rscc --set jobs.rscc.gpus=0`. + + ## Docker TODO: Docker container documentation diff --git a/analyses/all.toml b/analyses/all.toml new file mode 100644 index 00000000..02294cfa --- /dev/null +++ b/analyses/all.toml @@ -0,0 +1,56 @@ +description = "All grid-search evaluations, including optional external-tool checks."
+ +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" + +[[jobs]] +name = "rscc" +env = "analysis" +gpu_count = 1 +script = "scripts/eval/rscc_grid_search_script.py" +output_arg = "" +output_subdir = "analysis/rscc" + +[[jobs]] +name = "lddt" +env = "analysis" +gpus = "none" +script = "scripts/eval/lddt_evaluation_script.py" +output_arg = "" +output_subdir = "analysis/lddt" + +[[jobs]] +name = "bond_geometry" +env = "analysis" +gpus = "none" +script = "scripts/eval/bond_geometry_eval.py" +output_arg = "" +output_subdir = "analysis/bond_geometry" + +[[jobs]] +name = "tortoize" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_tortoize.py" +output_arg = "" +output_subdir = "analysis/tortoize" + +[[jobs]] +name = "phenix_clashscore" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_phenix_clashscore.py" +output_arg = "" +output_subdir = "analysis/phenix_clashscore" diff --git a/analyses/altloc_classify.toml b/analyses/altloc_classify.toml new file mode 100644 index 00000000..cd7cf730 --- /dev/null +++ b/analyses/altloc_classify.toml @@ -0,0 +1,25 @@ +description = "Classify altloc selections into side-chain, loop, and domain-shift categories." 
+ +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +ALTLOC_SELECTIONS_CSV = "${GRID_SEARCH_RESULTS_DIR}/analysis/altloc_selections.csv" +ALTLOC_CLASSIFICATIONS_CSV = "${GRID_SEARCH_RESULTS_DIR}/analysis/altloc_region_classifications.csv" +CIF_ROOT = "${GRID_SEARCH_INPUTS_DIR}" +DOMAIN_SHIFT_MIN_SPAN = "50" +LOOP_LDDT_THRESHOLD = "0.75" + +[shared_args] +input-csv = "${ALTLOC_SELECTIONS_CSV}" +cif-root = "${CIF_ROOT}" +output-file = "${ALTLOC_CLASSIFICATIONS_CSV}" +domain-shift-min-span = "${DOMAIN_SHIFT_MIN_SPAN}" +loop-lddt-threshold = "${LOOP_LDDT_THRESHOLD}" + +[[jobs]] +name = "classify_altloc_regions" +env = "analysis" +gpus = "none" +script = "scripts/eval/classify_altloc_regions.py" +output_arg = "" +output_subdir = "analysis/altloc_classify" diff --git a/analyses/altloc_find.toml b/analyses/altloc_find.toml new file mode 100644 index 00000000..e6d2718b --- /dev/null +++ b/analyses/altloc_find.toml @@ -0,0 +1,23 @@ +description = "Build an analysis protein-config CSV by finding altloc selections in input CIFs." 
+ +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEINS_CSV = "${GRID_SEARCH_INPUTS_DIR}/proteins.csv" +ALTLOC_SELECTIONS_CSV = "${GRID_SEARCH_RESULTS_DIR}/analysis/altloc_selections.csv" +ALTLOC_MIN_SPAN = "5" +ALTLOC_LABEL = "label_alt_id" + +[shared_args] +input-csv = "${PROTEINS_CSV}" +output-file = "${ALTLOC_SELECTIONS_CSV}" +min-span = "${ALTLOC_MIN_SPAN}" +altloc-label = "${ALTLOC_LABEL}" + +[[jobs]] +name = "find_altloc_selections" +env = "analysis" +gpus = "none" +script = "scripts/eval/find_altloc_selections.py" +output_arg = "" +output_subdir = "analysis/altloc_find" diff --git a/analyses/external_tools.toml b/analyses/external_tools.toml new file mode 100644 index 00000000..2ffa8226 --- /dev/null +++ b/analyses/external_tools.toml @@ -0,0 +1,32 @@ +description = "Evaluation jobs that require external executables: tortoize and phenix.clashscore." + +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" + +[[jobs]] +name = "tortoize" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_tortoize.py" +output_arg = "" +output_subdir = "analysis/tortoize" + +[[jobs]] +name = "phenix_clashscore" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_phenix_clashscore.py" +output_arg = "" +output_subdir = "analysis/phenix_clashscore" diff --git a/analyses/grid_search.toml b/analyses/grid_search.toml new file mode 100644 index 00000000..5cf68419 --- /dev/null +++ 
b/analyses/grid_search.toml @@ -0,0 +1,40 @@ +description = "Core grid-search evaluations: RSCC, LDDT clustering, and bond geometry." + +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" + +[[jobs]] +name = "rscc" +env = "analysis" +gpu_count = 1 +script = "scripts/eval/rscc_grid_search_script.py" +output_arg = "" +output_subdir = "analysis/rscc" + +[[jobs]] +name = "lddt" +env = "analysis" +gpus = "none" +script = "scripts/eval/lddt_evaluation_script.py" +output_arg = "" +output_subdir = "analysis/lddt" + +[[jobs]] +name = "bond_geometry" +env = "analysis" +gpus = "none" +script = "scripts/eval/bond_geometry_eval.py" +output_arg = "" +output_subdir = "analysis/bond_geometry" diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 0477a7dc..facdebea 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -6,7 +6,7 @@ # docker run pixi-with-checkpoints -e boltz run_grid_search.py --proteins /data/proteins.csv ... # docker run pixi-with-checkpoints bash # interactive shell # -# Available pixi environments: boltz, protenix, rf3 +# Available pixi environments: boltz, protenix, rf3, analysis # # Examples: # # Run grid search with RF3 @@ -36,7 +36,7 @@ IMPORTANT: Always use --shm-size=16g (or larger) to avoid shared memory errors with DataLoaders. 
OPTIONS: - -e, --env Pixi environment to use (boltz, protenix, rf3) + -e, --env Pixi environment to use (boltz, protenix, rf3, analysis) -h, --help Show this help message bash Start an interactive shell @@ -44,6 +44,7 @@ ENVIRONMENTS: boltz For boltz1 and boltz2 models protenix For protenix model rf3 For RF3 model + analysis For scripts/eval analysis jobs EXAMPLES: # Run grid search with RF3 model @@ -190,6 +191,10 @@ PROTEINS CSV FORMAT: 1abc,/data/structures/1abc.cif,/data/maps/1abc.ccp4,2.0 2xyz,/data/structures/2xyz.cif,/data/maps/2xyz.mrc,1.8 +ACTL helper commands: + run_experiments Run experiments/*.toml presets + run_analysis Run analyses/*.toml presets for scripts/eval + For full argument details, run: docker run pixi-with-checkpoints -e boltz run_grid_search.py --help EOF @@ -202,7 +207,7 @@ if [ $# -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then fi # Handle scientist workflow helpers and interactive shells -if [ "$1" = "run_experiments" ] || [ "$1" = "run_experiments.sh" ] || [ "$1" = "run_all_models.sh" ]; then +if [ "$1" = "run_experiments" ] || [ "$1" = "run_experiments.sh" ] || [ "$1" = "run_all_models.sh" ] || [ "$1" = "run_analysis" ] || [ "$1" = "run_analysis.sh" ]; then exec "$@" fi @@ -216,7 +221,7 @@ while [[ $# -gt 0 ]]; do case $1 in -e|--env) if [ -z "$2" ] || [[ "$2" == -* ]]; then - echo "Error: -e/--env requires an environment name (boltz, protenix, rf3)" + echo "Error: -e/--env requires an environment name (boltz, protenix, rf3, analysis)" exit 1 fi ENV="$2" @@ -236,7 +241,7 @@ done # Validate environment if [[ -z "$ENV" ]]; then - echo "Error: Environment not specified. Use -e where env is boltz, protenix, or rf3" + echo "Error: Environment not specified. Use -e where env is boltz, protenix, rf3, or analysis" echo "" echo "Usage: docker run pixi-with-checkpoints -e