diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index ce5ef754..fb8c855b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -41,6 +41,9 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} + # The Dockerfile uses COPY --from=diffuseproject/sampleworks-checkpoints:latest + # which Docker automatically pulls from Docker Hub during the build. + # No checkpoint files are needed in the CI build context. - name: Build and push Docker image id: build-push uses: docker/build-push-action@v5 diff --git a/Dockerfile b/Dockerfile index a2322e56..20a3d675 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Sampleworks - Protein structure prediction with diffusion model guidance # # This container includes all three model environments: boltz, protenix, rf3 -# Checkpoints are baked into the image at /checkpoints/ +# Checkpoints are baked into the image at /checkpoints/ via a pre-built base image. # # Build: # docker build -t sampleworks . 
@@ -48,12 +48,16 @@ # docker run --gpus all -it sampleworks bash # # Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest): -# /checkpoints/boltz1_conf.ckpt - Boltz1 model -# /checkpoints/boltz2_conf.ckpt - Boltz2 model -# /checkpoints/ccd.pkl - Chemical Component Dictionary (required for Boltz) -# /checkpoints/mols/ - Boltz2 molecules data -# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model -# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model +# /checkpoints/boltz1_conf.ckpt - Boltz1 model (~3.5GB) +# /checkpoints/boltz2_conf.ckpt - Boltz2 model (~2.3GB) +# /checkpoints/ccd.pkl - Chemical Component Dictionary (~345MB) +# /checkpoints/mols/ - Boltz2 molecule data (~2GB) +# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model (~2.9GB) +# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model (~1.4GB) +# +# Checkpoints base image: +# All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub. +# To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server. # ============================================================================ # Base stage: CUDA + Pixi + common system dependencies @@ -116,10 +120,13 @@ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-comp # ============================================================================ # Bake in model checkpoints from pre-built base image on Docker Hub -# This image contains: boltz1, boltz2, ccd, mols/, rf3, protenix checkpoints +# ============================================================================ +# All checkpoints (Boltz1, Boltz2, CCD, mols, RF3, Protenix) are pre-built +# into diffuseproject/sampleworks-checkpoints:latest on Docker Hub. +# This avoids downloading ~6GB from HuggingFace during build and removes the +# need to have RF3/Protenix checkpoints in the build context. 
# Rebuild with: docker build -t diffuseproject/sampleworks-checkpoints:latest # docker push diffuseproject/sampleworks-checkpoints:latest -# ============================================================================ COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ # Set default checkpoint paths via environment variables diff --git a/run_all_models.sh b/run_all_models.sh index b7788a4f..c958a7cf 100755 --- a/run_all_models.sh +++ b/run_all_models.sh @@ -1,120 +1,156 @@ #!/bin/bash -# Run all model grid searches in parallel: Boltz1, Boltz2, Protenix, and RF3 -# Total: 16 GPUs used (4 jobs x 4 GPUs each) +# Run all 4 model grid searches in parallel, 2 GPUs each +# Total: 8 GPUs used (4 jobs x 2 GPUs each) # -# Checkpoints are BAKED INTO the Docker image - no need to mount them! +# Models: +# - Boltz2 X-ray diffraction (GPUs 0,1) +# - Boltz2 MD (GPUs 2,3) +# - RosettaFold3 (GPUs 4,5) +# - Protenix (GPUs 6,7) +# +# Checkpoints are BAKED INTO the Docker image at /checkpoints/. +# If missing, the code auto-falls back to mounted paths. # # Usage: # ./run_all_models.sh set -e -# Configuration - uses absolute path to data -DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40" -RESULTS_DIR="${RESULTS_DIR:-$HOME/sampleworks-exp/grid_search_results}" -# Docker image to use (override with IMAGE env var) -IMAGE="${IMAGE:-diffuseproject/sampleworks:latest}" +# Configuration +DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}" +MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}" -# Create output directory +# Create directories mkdir -p "$RESULTS_DIR" +mkdir -p "$MSA_CACHE_DIR" + +# Pull latest image (no-op if already up to date) +echo "Pulling latest Docker image..." 
+docker pull diffuseproject/sampleworks:latest # Common docker options DOCKER_OPTS="--rm --shm-size=16g" echo "==========================================" -echo "Starting all model grid searches" -echo "Models: boltz1, boltz2, protenix, rf3" +echo "Starting all model grid searches (4 jobs x 2 GPUs)" echo "Data: $DATA_DIR" echo "Results: $RESULTS_DIR" -echo "Image: $IMAGE" -echo "Checkpoints: BAKED INTO IMAGE" +echo "MSA Cache: $MSA_CACHE_DIR" +echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)" +echo "" +echo "Models:" +echo " - Boltz2 X-ray (GPUs 0,1)" +echo " - Boltz2 MD (GPUs 2,3)" +echo " - RF3 (GPUs 4,5)" +echo " - Protenix (GPUs 6,7)" echo "==========================================" -# Track background job PIDs -declare -a PIDS=() -declare -a PID_NAMES=() - -# Function to run a model with specific GPUs -# Usage: run_model [extra_args...] -run_model() { - local model=$1 - local env=$2 - local gpus=$3 - shift 3 - local extra_args=("$@") - - echo "[$(date)] Starting $model on GPUs $gpus" - - docker run $DOCKER_OPTS \ - --gpus "\"device=$gpus\"" \ - -v /mnt/diffuse-private:/mnt/diffuse-private:ro \ - -v "$RESULTS_DIR:/data/results" \ - "$IMAGE" \ - -e "$env" run_grid_search.py \ - --proteins "$DATA_DIR/proteins.csv" \ - --models "$model" \ - --scalers "pure_guidance" \ - --ensemble-sizes "1 4" \ - --gradient-weights "0.1 0.2" \ - --gradient-normalization --augmentation --align-to-input \ - --use-tweedie \ - --output-dir /data/results \ - "${extra_args[@]}" \ - 2>&1 | tee "$RESULTS_DIR/${model}_run.log" & - - PIDS+=($!) 
- PID_NAMES+=("$model") - echo "[$(date)] $model job started (PID: $!)" -} - -# Run all four models in parallel with 4 GPUs each: -# - boltz1: GPUs 0,1,2,3 -# - boltz2: GPUs 4,5,6,7 -# - protenix: GPUs 8,9,10,11 -# - rf3: GPUs 12,13,14,15 - -# Boltz1 (GPUs 0-3) - checkpoints baked in, uses defaults -run_model "boltz1" "boltz" "0,1,2,3" - -# Boltz2 (GPUs 4-7) - needs --methods flag -run_model "boltz2" "boltz" "4,5,6,7" --methods "X-RAY DIFFRACTION" - -# Protenix (GPUs 8-11) -run_model "protenix" "protenix" "8,9,10,11" - -# RF3 (GPUs 12-15) -run_model "rf3" "rf3" "12,13,14,15" +PIDS=() + +# --- Boltz2 X-ray Diffraction (GPUs 0,1) --- +echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1" +docker run $DOCKER_OPTS \ + --gpus '"device=0,1"' \ + -v "$DATA_DIR:/data/inputs:ro" \ + -v "$RESULTS_DIR:/data/results" \ + -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ + diffuseproject/sampleworks:latest \ + -e boltz run_grid_search.py \ + --proteins "/data/inputs/proteins.csv" \ + --models boltz2 \ + --methods "X-RAY DIFFRACTION" \ + --scalers pure_guidance \ + --partial-diffusion-step 120 \ + --ensemble-sizes "8" \ + --gradient-weights "0.1 0.2 0.5" \ + --gradient-normalization --augmentation --align-to-input \ + --output-dir /data/results \ + 2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" & +PIDS+=($!) 
+echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})" + +# --- Boltz2 MD (GPUs 2,3) --- +echo "[$(date)] Starting Boltz2 MD on GPUs 2,3" +docker run $DOCKER_OPTS \ + --gpus '"device=2,3"' \ + -v "$DATA_DIR:/data/inputs:ro" \ + -v "$RESULTS_DIR:/data/results" \ + -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ + diffuseproject/sampleworks:latest \ + -e boltz run_grid_search.py \ + --proteins "/data/inputs/proteins.csv" \ + --models boltz2 \ + --methods "MD" \ + --scalers pure_guidance \ + --partial-diffusion-step 120 \ + --ensemble-sizes "8" \ + --gradient-weights "0.1 0.2 0.5" \ + --gradient-normalization --augmentation --align-to-input \ + --output-dir /data/results \ + 2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" & +PIDS+=($!) +echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})" + +# --- RosettaFold3 (GPUs 4,5) --- +echo "[$(date)] Starting RosettaFold3 on GPUs 4,5" +docker run $DOCKER_OPTS \ + --gpus '"device=4,5"' \ + -v "$DATA_DIR:/data/inputs:ro" \ + -v "$RESULTS_DIR:/data/results" \ + -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ + diffuseproject/sampleworks:latest \ + -e rf3 run_grid_search.py \ + --proteins "/data/inputs/proteins.csv" \ + --models rf3 \ + --partial-diffusion-step 120 \ + --scalers pure_guidance \ + --ensemble-sizes "8" \ + --gradient-weights "0.01 0.02 0.05" \ + --gradient-normalization --augmentation --align-to-input \ + --output-dir /data/results \ + 2>&1 | tee "$RESULTS_DIR/rf3_run.log" & +PIDS+=($!) 
+echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})" + +# --- Protenix (GPUs 6,7) --- +echo "[$(date)] Starting Protenix on GPUs 6,7" +docker run $DOCKER_OPTS \ + --gpus '"device=6,7"' \ + -v "$DATA_DIR:/data/inputs:ro" \ + -v "$RESULTS_DIR:/data/results" \ + -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ + diffuseproject/sampleworks:latest \ + -e protenix run_grid_search.py \ + --proteins "/data/inputs/proteins.csv" \ + --models protenix \ + --scalers pure_guidance \ + --partial-diffusion-step 120 \ + --ensemble-sizes "8" \ + --gradient-weights "0.1 0.2 0.5" \ + --gradient-normalization --augmentation --align-to-input \ + --output-dir /data/results \ + 2>&1 | tee "$RESULTS_DIR/protenix_run.log" & +PIDS+=($!) +echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})" echo "" echo "==========================================" -echo "All model jobs launched!" +echo "All 4 jobs launched! PIDs: ${PIDS[*]}" echo "Logs:" -echo " - $RESULTS_DIR/boltz1_run.log" -echo " - $RESULTS_DIR/boltz2_run.log" -echo " - $RESULTS_DIR/protenix_run.log" +echo " - $RESULTS_DIR/boltz2_xrd_run.log" +echo " - $RESULTS_DIR/boltz2_md_run.log" echo " - $RESULTS_DIR/rf3_run.log" +echo " - $RESULTS_DIR/protenix_run.log" echo "" echo "Monitor GPU usage: nvidia-smi -l 1" echo "Waiting for all jobs to complete..." echo "==========================================" -# Wait for all background jobs and check exit codes -overall_exit=0 -for i in "${!PIDS[@]}"; do - if wait "${PIDS[$i]}"; then - echo "[$(date)] ${PID_NAMES[$i]} completed successfully" - else - echo "[$(date)] ${PID_NAMES[$i]} FAILED (exit code: $?)" - overall_exit=1 - fi -done +# Wait for all background jobs +wait echo "" echo "==========================================" -if [ $overall_exit -eq 0 ]; then - echo "[$(date)] All jobs completed successfully!" -else - echo "[$(date)] Some jobs FAILED — check logs above" -fi +echo "[$(date)] All jobs completed!" 
echo "==========================================" -exit $overall_exit diff --git a/src/sampleworks/utils/guidance_script_arguments.py b/src/sampleworks/utils/guidance_script_arguments.py index f88a1f66..ebcef29a 100644 --- a/src/sampleworks/utils/guidance_script_arguments.py +++ b/src/sampleworks/utils/guidance_script_arguments.py @@ -8,6 +8,51 @@ from sampleworks.utils.guidance_constants import GuidanceType, StructurePredictor +# Baked-in checkpoint paths (Docker image) with legacy fallbacks +_CHECKPOINT_CANDIDATES = { + "boltz1": ["/checkpoints/boltz1_conf.ckpt", "~/.boltz/boltz1_conf.ckpt"], + "boltz2": ["/checkpoints/boltz2_conf.ckpt", "~/.boltz/boltz2_conf.ckpt"], + "rf3": [ + "/checkpoints/rf3_foundry_01_24_latest.ckpt", + "~/.foundry/checkpoints/rf3_foundry_01_24_latest.ckpt", + ], + "protenix": [ + "/checkpoints/protenix_base_default_v0.5.0.pt", + ".pixi/envs/protenix-dev/lib/python3.12/site-packages/release_data/checkpoint/protenix_base_default_v0.5.0.pt", + ], +} + + +def _resolve_checkpoint(model_key: str) -> str: + """Return the first checkpoint path that exists on disk for *model_key*. + + Tries baked-in Docker paths first (``/checkpoints/``), then falls back to + legacy development paths. If no candidate exists (or *model_key* is unknown) + a ``ValueError`` is raised; it names the primary (baked-in) path when known. + """ + candidates = _CHECKPOINT_CANDIDATES.get(model_key, []) + for candidate in candidates: + resolved = Path(candidate).expanduser() + if resolved.exists(): + return str(resolved) + # Nothing found – use the primary (baked-in) path in the error message so it + # points the user to the expected location. + resolved = candidates[0] if candidates else "" + if not resolved: + raise ValueError( + f"Running guidance requires a model checkpoint for '{model_key}'. " + f"Provide --model-checkpoint or bake checkpoints into /checkpoints/." + ) + if not Path(resolved).exists(): + raise ValueError( + f"Model checkpoint '{resolved}' does not exist. 
" + f"Provide a valid path via --model-checkpoint." + ) + + return resolved + + + def get_checkpoint(args: argparse.Namespace) -> str | None: """Resolve a model checkpoint path from an argparse namespace. @@ -27,6 +72,10 @@ def validate_model_checkpoint( ) -> str: """Validate and normalize the checkpoint path for ``model``. + When *checkpoint* is ``None`` or blank (no ``--model-checkpoint`` provided), the + function auto-resolves by checking baked-in Docker paths first + (``/checkpoints/``) and then legacy development paths. + Returns ------- str @@ -35,12 +84,20 @@ def validate_model_checkpoint( Raises ------ ValueError - If checkpoint is missing/empty or points to a directory. + If no checkpoint is given and none can be auto-resolved, or it points to a directory. FileNotFoundError - If checkpoint does not exist. + If checkpoint does not exist on disk. """ + # Auto-resolve when no explicit checkpoint was provided if checkpoint is None or str(checkpoint).strip() == "": - raise ValueError(f"Missing checkpoint for model '{model}'. Provide --model-checkpoint.") + model_key = str(model).lower().replace("structurepredictor.", "") + resolved = _resolve_checkpoint(model_key) + if not resolved: + raise ValueError( + f"Missing checkpoint for model '{model}'. " + f"Provide --model-checkpoint or bake checkpoints into /checkpoints/." 
+ ) + checkpoint = resolved checkpoint_path = Path(str(checkpoint)).expanduser().resolve() @@ -111,6 +168,10 @@ def populate_config_for_guidance_type(self, job: JobConfig, args: argparse.Names checkpoint = get_checkpoint(args) if checkpoint is not None: self.model_checkpoint = checkpoint + elif not getattr(self, "model_checkpoint", None): + # Auto-resolve from baked-in /checkpoints/ or legacy fallback paths + model_key = str(self.model).lower().replace("structurepredictor.", "") + self.model_checkpoint = _resolve_checkpoint(model_key) if job.model == StructurePredictor.BOLTZ_2 and job.method: self.method = job.method @@ -237,8 +298,8 @@ def add_boltz2_specific_args(parser: argparse.ArgumentParser | GuidanceConfig): parser.add_argument( "--model-checkpoint", type=str, - default="~/.boltz/boltz2_conf.ckpt", - help="Path to Boltz2 checkpoint", + default=None, + help="Path to Boltz2 checkpoint (default: auto-resolved from /checkpoints/ or ~/.boltz/)", ) parser.add_argument( "--method", @@ -252,8 +313,8 @@ def add_protenix_specific_args(parser: argparse.ArgumentParser | GuidanceConfig) parser.add_argument( "--model-checkpoint", type=str, - default=".pixi/envs/protenix-dev/lib/python3.12/site-packages/release_data/checkpoint/protenix_base_default_v0.5.0.pt", - help="Path to Protenix checkpoint directory", + default=None, + help="Path to Protenix checkpoint (default: auto-resolved from /checkpoints/ or pixi env)", ) @@ -261,8 +322,8 @@ def add_boltz1_specific_args(parser: argparse.ArgumentParser | GuidanceConfig): parser.add_argument( "--model-checkpoint", type=str, - default="~/.boltz/boltz1_conf.ckpt", - help="Path to Boltz1 checkpoint", + default=None, + help="Path to Boltz1 checkpoint (default: auto-resolved from /checkpoints/ or ~/.boltz/)", ) @@ -270,8 +331,8 @@ def add_rf3_specific_args(parser: argparse.ArgumentParser | GuidanceConfig): parser.add_argument( "--model-checkpoint", type=str, - default="~/.foundry/checkpoints/rf3_foundry_01_24_latest.ckpt", - 
help="Path to RF3 checkpoint", + default=None, + help="Path to RF3 checkpoint (default: auto-resolved from /checkpoints/ or ~/.foundry/)", ) parser.add_argument( "--msa-path", diff --git a/src/sampleworks/utils/guidance_script_utils.py b/src/sampleworks/utils/guidance_script_utils.py index 12063257..505f6e94 100644 --- a/src/sampleworks/utils/guidance_script_utils.py +++ b/src/sampleworks/utils/guidance_script_utils.py @@ -36,6 +36,7 @@ from sampleworks.utils.guidance_script_arguments import ( GuidanceConfig, JobResult, + _resolve_checkpoint, validate_model_checkpoint, ) from sampleworks.utils.msa import MSAManager @@ -563,7 +564,13 @@ def run_guidance_job_queue(job_queue_path: str) -> list[JobResult]: template_job = job_queue[0] if template_job.model_checkpoint is None or template_job.model_checkpoint == "": - raise ValueError("Running guidance requires that you specify a model checkpoint") + # Auto-resolve from baked-in /checkpoints/ or legacy fallback paths + model_key = str(template_job.model).lower().replace("structurepredictor.", "") + resolved = _resolve_checkpoint(model_key) # will raise if not found + template_job.model_checkpoint = resolved + # Propagate to all jobs in the queue + for job in job_queue: + job.model_checkpoint = resolved logger.info(f"Running {len(job_queue)} jobs, using {template_job} as a setup template") device, model_wrapper = get_model_and_device(