diff --git a/Dockerfile b/Dockerfile index ec8a9823..cacbd13b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -102,7 +102,8 @@ RUN --mount=type=cache,target=/root/.cache/pixi \ --mount=type=cache,target=/root/.cache/uv \ pixi install -e boltz --frozen && \ pixi install -e protenix --frozen && \ - pixi install -e rf3 --frozen + pixi install -e rf3 --frozen && \ + pixi install -e analysis --frozen # A GPU is not required to build the image. Pre-compile CUDA extensions only when # the builder exposes NVIDIA devices; if present, failures should stop the build. @@ -114,6 +115,16 @@ from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dil print('CUDA extensions compiled successfully')"; \ fi +# This image carries pixi environments and checkpoints. Runtime source should +# come from ACTL's synced checkout at /home/dev/workspace, not from stale code +# baked into /app during image construction. +RUN rm -rf /app/src /app/scripts /app/experiments /app/analyses \ + /app/run_grid_search.py /app/run_analysis \ + && mkdir -p /home/dev/workspace + +COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh run_analysis run_analysis.sh /usr/local/bin/ +RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n cd /home/dev/workspace\nfi\n' >> /root/.bashrc + # ============================================================================ # Public runtime: regular Sampleworks image for the public registry # ============================================================================ diff --git a/README.md b/README.md index 04deb915..0949a887 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,8 @@ Output layout: `grid_search_results//[_]//ens > **Note**: Jobs are skipped if a `refined.cif` file already exists in the output directory. Some flags (e.g., `--use-tweedie`, `--gradient-normalization`) are not reflected in the directory structure, so changing them alone won't trigger a re-run. Use `--force-all` to re-run all jobs regardless. This is under active development and will likely change soon. -Instructions for running evaluation and metrics scripts are coming soon. +Evaluation and metrics scripts can be run through `run_analysis`; see the ACTL +section below and `scripts/eval/EVALUATION.md`. ## Running preset experiments on ACTL (`run_experiments`) @@ -246,6 +247,39 @@ debugging only, opt into an on-pod pixi update with rebuilt `pixi-with-checkpoints:sampleworks` image instead. +## Running preset analyses on ACTL (`run_analysis`) + +`run_analysis` uses the same TOML runner as `run_experiments`, but loads presets +from `analyses/*.toml` and runs the scripts under `scripts/eval/`. +The `analyze_grid_search`, `all`, and `external_tools` presets first run a sequential +`patch_outputs` pre-job, which creates `refined-patched.cif` files from each +`refined.cif` before the evaluation jobs start. + +```bash +export GRID_SEARCH_RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks//full_8gpu +export GRID_SEARCH_INPUTS_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps +export PROTEIN_CONFIGS_CSV="$GRID_SEARCH_INPUTS_DIR/protein_analysis_config.csv" + +run_analysis --list +run_analysis --dry-run analyze_grid_search --jobs rscc +run_analysis analyze_grid_search --jobs rscc,lddt +run_analysis altloc_find +run_analysis altloc_classify +run_analysis all # includes tortoize and phenix.clashscore jobs +``` + +Use `--set` for one-off changes, for example +`run_analysis analyze_grid_search --jobs rscc --set jobs.rscc.gpus=0`. If your +input layout differs from the default +`processed/{pdb_id}/{pdb_id}_single_001_density_input.cif`, override the patch +pre-job with `--set defaults.PATCH_INPUT_PDB_PATTERN='{pdb_id}/{pdb_id}_original.cif'`. +When patched CIFs already exist, add `--skip-pre-jobs` to rerun analyses without +repeating the patching step. +The `altloc_find` and `altloc_classify` presets are independent of grid-search +outputs; override `ALTLOC_ANALYSIS_DIR` and `ALTLOC_INPUTS_DIR` when their input +or output roots differ from the defaults. + + ## Docker Sampleworks now has a two-layer image split: diff --git a/analyses/all.toml b/analyses/all.toml new file mode 100644 index 00000000..b286aaa2 --- /dev/null +++ b/analyses/all.toml @@ -0,0 +1,71 @@ +description = "All grid-search evaluations, including optional external-tool checks." + +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" +PATCH_CIF_PATTERN = "refined.cif" +GRID_SEARCH_DEPTH = "4" +PATCH_DEPTH = "${GRID_SEARCH_DEPTH}" +PATCH_INPUT_PDB_PATTERN = "processed/{pdb_id}/{pdb_id}_single_001_density_input.cif" +PATCH_RCSB_PATTERN = "${GRID_SEARCH_RESULTS_DIR}/([A-Za-z0-9]{4})" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" +depth = "${GRID_SEARCH_DEPTH}" + +[[pre_jobs]] +name = "patch_outputs" +env = "analysis" +gpus = "none" +script = "scripts/patch_output_cif_files.py" +output_arg = "" +output_subdir = "analysis/patch_outputs" +args = { input-dir = "${GRID_SEARCH_RESULTS_DIR}", cif-pattern = "${PATCH_CIF_PATTERN}", rcsb-pattern = "${PATCH_RCSB_PATTERN}", depth = "${PATCH_DEPTH}", grid-search-input-dir = "${GRID_SEARCH_INPUTS_DIR}", input-pdb-pattern = "${PATCH_INPUT_PDB_PATTERN}" } + +[[jobs]] +name = "rscc" +env = "analysis" +gpu_count = 1 +script = "scripts/eval/rscc_grid_search_script.py" +output_arg = "" +output_subdir = "analysis/rscc" + +[[jobs]] +name = "lddt" +env = "analysis" +gpus = "none" +script = "scripts/eval/lddt_evaluation_script.py" +output_arg = "" +output_subdir = "analysis/lddt" + +[[jobs]] +name = "bond_geometry" +env = "analysis" +gpus = "none" +script = "scripts/eval/bond_geometry_eval.py" +output_arg = "" +output_subdir = "analysis/bond_geometry" + +[[jobs]] +name = "tortoize" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_tortoize.py" +output_arg = "" +output_subdir = "analysis/tortoize" + +[[jobs]] +name = "phenix_clashscore" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_phenix_clashscore.py" +output_arg = "" +output_subdir = "analysis/phenix_clashscore" diff --git a/analyses/altloc_classify.toml b/analyses/altloc_classify.toml new file mode 100644 index 00000000..152bd33c --- /dev/null +++ b/analyses/altloc_classify.toml @@ -0,0 +1,25 @@ +description = "Classify altloc selections into side-chain, loop, and domain-shift categories." + +[defaults] +ALTLOC_ANALYSIS_DIR = "/data/results/altloc_analysis" +ALTLOC_INPUTS_DIR = "/data/inputs" +ALTLOC_SELECTIONS_CSV = "${ALTLOC_ANALYSIS_DIR}/altloc_selections.csv" +ALTLOC_CLASSIFICATIONS_CSV = "${ALTLOC_ANALYSIS_DIR}/altloc_region_classifications.csv" +CIF_ROOT = "${ALTLOC_INPUTS_DIR}" +DOMAIN_SHIFT_MIN_SPAN = "50" +LOOP_LDDT_THRESHOLD = "0.75" + +[shared_args] +input-csv = "${ALTLOC_SELECTIONS_CSV}" +cif-root = "${CIF_ROOT}" +output-file = "${ALTLOC_CLASSIFICATIONS_CSV}" +domain-shift-min-span = "${DOMAIN_SHIFT_MIN_SPAN}" +loop-lddt-threshold = "${LOOP_LDDT_THRESHOLD}" + +[[jobs]] +name = "classify_altloc_regions" +env = "analysis" +gpus = "none" +script = "scripts/eval/classify_altloc_regions.py" +output_arg = "" +output_subdir = "analysis/altloc_classify" diff --git a/analyses/altloc_find.toml b/analyses/altloc_find.toml new file mode 100644 index 00000000..898e5301 --- /dev/null +++ b/analyses/altloc_find.toml @@ -0,0 +1,23 @@ +description = "Build an analysis protein-config CSV by finding altloc selections in input CIFs." + +[defaults] +ALTLOC_ANALYSIS_DIR = "/data/results/altloc_analysis" +ALTLOC_INPUTS_DIR = "/data/inputs" +PROTEINS_CSV = "${ALTLOC_INPUTS_DIR}/proteins.csv" +ALTLOC_SELECTIONS_CSV = "${ALTLOC_ANALYSIS_DIR}/altloc_selections.csv" +ALTLOC_MIN_SPAN = "5" +ALTLOC_LABEL = "label_alt_id" + +[shared_args] +input-csv = "${PROTEINS_CSV}" +output-file = "${ALTLOC_SELECTIONS_CSV}" +min-span = "${ALTLOC_MIN_SPAN}" +altloc-label = "${ALTLOC_LABEL}" + +[[jobs]] +name = "find_altloc_selections" +env = "analysis" +gpus = "none" +script = "scripts/eval/find_altloc_selections.py" +output_arg = "" +output_subdir = "analysis/altloc_find" diff --git a/analyses/analyze_grid_search.toml b/analyses/analyze_grid_search.toml new file mode 100644 index 00000000..334779ae --- /dev/null +++ b/analyses/analyze_grid_search.toml @@ -0,0 +1,55 @@ +description = "Analyze grid-search outputs with RSCC, LDDT clustering, and bond geometry." + +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" +PATCH_CIF_PATTERN = "refined.cif" +GRID_SEARCH_DEPTH = "4" +PATCH_DEPTH = "${GRID_SEARCH_DEPTH}" +PATCH_INPUT_PDB_PATTERN = "processed/{pdb_id}/{pdb_id}_single_001_density_input.cif" +PATCH_RCSB_PATTERN = "${GRID_SEARCH_RESULTS_DIR}/([A-Za-z0-9]{4})" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" +depth = "${GRID_SEARCH_DEPTH}" + +[[pre_jobs]] +name = "patch_outputs" +env = "analysis" +gpus = "none" +script = "scripts/patch_output_cif_files.py" +output_arg = "" +output_subdir = "analysis/patch_outputs" +args = { input-dir = "${GRID_SEARCH_RESULTS_DIR}", cif-pattern = "${PATCH_CIF_PATTERN}", rcsb-pattern = "${PATCH_RCSB_PATTERN}", depth = "${PATCH_DEPTH}", grid-search-input-dir = "${GRID_SEARCH_INPUTS_DIR}", input-pdb-pattern = "${PATCH_INPUT_PDB_PATTERN}" } + +[[jobs]] +name = "rscc" +env = "analysis" +gpu_count = 1 +script = "scripts/eval/rscc_grid_search_script.py" +output_arg = "" +output_subdir = "analysis/rscc" + +[[jobs]] +name = "lddt" +env = "analysis" +gpus = "none" +script = "scripts/eval/lddt_evaluation_script.py" +output_arg = "" +output_subdir = "analysis/lddt" + +[[jobs]] +name = "bond_geometry" +env = "analysis" +gpus = "none" +script = "scripts/eval/bond_geometry_eval.py" +output_arg = "" +output_subdir = "analysis/bond_geometry" diff --git a/analyses/external_tools.toml b/analyses/external_tools.toml new file mode 100644 index 00000000..b6678c43 --- /dev/null +++ b/analyses/external_tools.toml @@ -0,0 +1,47 @@ +description = "Evaluation jobs that require external executables: tortoize and phenix.clashscore." + +[defaults] +GRID_SEARCH_RESULTS_DIR = "/data/results/grid_search_results" +GRID_SEARCH_INPUTS_DIR = "/data/inputs" +PROTEIN_CONFIGS_CSV = "${GRID_SEARCH_INPUTS_DIR}/protein_analysis_config.csv" +TARGET_FILENAME = "refined-patched.cif" +N_JOBS = "16" +PATCH_CIF_PATTERN = "refined.cif" +GRID_SEARCH_DEPTH = "4" +PATCH_DEPTH = "${GRID_SEARCH_DEPTH}" +PATCH_INPUT_PDB_PATTERN = "processed/{pdb_id}/{pdb_id}_single_001_density_input.cif" +PATCH_RCSB_PATTERN = "${GRID_SEARCH_RESULTS_DIR}/([A-Za-z0-9]{4})" + +[shared_args] +grid-search-results-path = "${GRID_SEARCH_RESULTS_DIR}" +grid-search-inputs-path = "${GRID_SEARCH_INPUTS_DIR}" +protein-configs-csv = "${PROTEIN_CONFIGS_CSV}" +target-filename = "${TARGET_FILENAME}" +occupancies = [0.0, 0.25, 0.5, 0.75, 1.0] +n-jobs = "${N_JOBS}" +depth = "${GRID_SEARCH_DEPTH}" + +[[pre_jobs]] +name = "patch_outputs" +env = "analysis" +gpus = "none" +script = "scripts/patch_output_cif_files.py" +output_arg = "" +output_subdir = "analysis/patch_outputs" +args = { input-dir = "${GRID_SEARCH_RESULTS_DIR}", cif-pattern = "${PATCH_CIF_PATTERN}", rcsb-pattern = "${PATCH_RCSB_PATTERN}", depth = "${PATCH_DEPTH}", grid-search-input-dir = "${GRID_SEARCH_INPUTS_DIR}", input-pdb-pattern = "${PATCH_INPUT_PDB_PATTERN}" } + +[[jobs]] +name = "tortoize" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_tortoize.py" +output_arg = "" +output_subdir = "analysis/tortoize" + +[[jobs]] +name = "phenix_clashscore" +env = "analysis" +gpus = "none" +script = "scripts/eval/run_and_process_phenix_clashscore.py" +output_arg = "" +output_subdir = "analysis/phenix_clashscore" diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 0c1c79e5..ac6e1a83 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -6,7 +6,7 @@ # docker run pixi-with-checkpoints -e boltz run_grid_search.py --proteins /data/proteins.csv ... # docker run pixi-with-checkpoints bash # interactive shell # -# Available pixi environments: boltz, protenix, rf3 +# Available pixi environments: boltz, protenix, rf3, analysis # # Examples: # # Run grid search with RF3 @@ -36,7 +36,7 @@ IMPORTANT: Always use --shm-size=16g (or larger) to avoid shared memory errors with DataLoaders. OPTIONS: - -e, --env Pixi environment to use (boltz, protenix, rf3) + -e, --env Pixi environment to use (boltz, protenix, rf3, analysis) -h, --help Show this help message bash Start an interactive shell @@ -44,6 +44,7 @@ ENVIRONMENTS: boltz For boltz1 and boltz2 models protenix For protenix model rf3 For RF3 model + analysis For scripts/eval analysis jobs EXAMPLES: # Run grid search with RF3 model @@ -190,6 +191,10 @@ PROTEINS CSV FORMAT: 1abc,/data/structures/1abc.cif,/data/maps/1abc.ccp4,2.0 2xyz,/data/structures/2xyz.cif,/data/maps/2xyz.mrc,1.8 +ACTL helper commands: + run_experiments Run experiments/*.toml presets + run_analysis Run analyses/*.toml presets for scripts/eval + For full argument details, run: docker run pixi-with-checkpoints -e boltz run_grid_search.py --help EOF @@ -202,7 +207,7 @@ if [ $# -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then fi # Handle scientist workflow helpers and interactive shells -if [ "$1" = "run_experiments" ] || [ "$1" = "run_experiments.sh" ] || [ "$1" = "run_all_models.sh" ]; then +if [ "$1" = "run_experiments" ] || [ "$1" = "run_experiments.sh" ] || [ "$1" = "run_all_models.sh" ] || [ "$1" = "run_analysis" ] || [ "$1" = "run_analysis.sh" ]; then exec "$@" fi @@ -216,7 +221,7 @@ while [[ $# -gt 0 ]]; do case $1 in -e|--env) if [ -z "$2" ] || [[ "$2" == -* ]]; then - echo "Error: -e/--env requires an environment name (boltz, protenix, rf3)" + echo "Error: -e/--env requires an environment name (boltz, protenix, rf3, analysis)" exit 1 fi ENV="$2" @@ -236,7 +241,7 @@ done # Validate environment if [[ -z "$ENV" ]]; then - echo "Error: Environment not specified. Use -e where env is boltz, protenix, or rf3" + echo "Error: Environment not specified. Use -e where env is boltz, protenix, rf3, or analysis" echo "" echo "Usage: docker run pixi-with-checkpoints -e