diff --git a/.actlignore b/.actlignore new file mode 100644 index 00000000..047b4971 --- /dev/null +++ b/.actlignore @@ -0,0 +1,13 @@ +# Keep ACTL sync focused on source. Large data/results should live under +# /mnt/diffuse-shared or the pod home PVC, not in the synced checkout. +.pixi/ +grid_search_results/ +outputs/ +data/ +initial_dataset_40*/ +checkpoints/ +release_data/ +*.ckpt +*.pt +*.tar.gz +*.tgz diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d5123f31..9e5350c4 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,7 +1,7 @@ name: Build and Push Docker Images -# CI builds pull all model checkpoints (~10 GB) from Docker Hub automatically via: -# COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ +# CI builds pull all model checkpoints (~10 GB) from Harbor automatically via: +# COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ # No checkpoint files are needed on the CI runner. The checkpoints base image is # pre-built and pushed from the GPU server. See Dockerfile comments for details. @@ -13,8 +13,8 @@ on: workflow_dispatch: env: - DOCKERHUB_ORG: diffuseproject - IMAGE_NAME: sampleworks + REGISTRY: harbor.astera.sh + IMAGE_NAME: library/pixi-with-checkpoints jobs: build: @@ -39,24 +39,25 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 - - name: Login to Docker Hub + - name: Login to Harbor uses: docker/login-action@v4 with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} + registry: ${{ env.REGISTRY }} + username: ${{ secrets.HARBOR_USERNAME }} + password: ${{ secrets.HARBOR_PASSWORD }} - # The Dockerfile uses COPY --from=diffuseproject/sampleworks-checkpoints:latest - # which Docker automatically pulls from Docker Hub during the build. 
+ # The Dockerfile uses COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest + # which Docker automatically pulls from Harbor during the build. # No checkpoint files are needed in the CI build context. - name: Docker metadata id: meta uses: docker/metadata-action@v6 with: - images: ${{ env.DOCKERHUB_ORG }}/${{ env.IMAGE_NAME }} + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | - type=raw,value=latest - type=sha,prefix= + type=raw,value=sampleworks + type=sha,prefix=sha- type=semver,pattern={{version}} type=semver,pattern=v{{version}} - name: Build and push Docker image diff --git a/AGENTS.md b/AGENTS.md index b56d51d6..7e1b9436 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -154,6 +154,7 @@ src/sampleworks/ ├── metrics/ # Quality metrics (LDDT, sidechain) ├── eval/ # Evaluation utilities ├── data/ # Reference data (protein configs) +├── runs/ # `sampleworks-runs` CLI + TOML preset orchestrator └── utils/ # Shared utilities ``` diff --git a/Dockerfile b/Dockerfile index a51f8651..439421e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,10 +5,10 @@ # Checkpoints are baked into the image at /checkpoints/ via a pre-built base image. # # Build: -# docker build -t sampleworks . +# docker build -t pixi-with-checkpoints . # -# CI builds pull checkpoints automatically from Docker Hub via: -# COPY --from=diffuseproject/sampleworks-checkpoints:latest +# CI builds pull checkpoints automatically from Harbor via: +# COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest # No checkpoint files are needed in the build context or on the CI runner. 
# # To rebuild the checkpoints base image (only needed when checkpoints change): @@ -16,10 +16,10 @@ # # Run examples: # # Show help -# docker run sampleworks --help +# docker run pixi-with-checkpoints --help # # # Run grid search with Boltz1 (checkpoint baked in) -# docker run --gpus all -v /data:/data sampleworks \ +# docker run --gpus all -v /data:/data pixi-with-checkpoints \ # -e boltz run_grid_search.py \ # --proteins /data/proteins.csv \ # --models boltz1 \ @@ -33,7 +33,7 @@ # --align-to-input # # # Run grid search with Boltz2 (checkpoint baked in) -# docker run --gpus all -v /data:/data sampleworks \ +# docker run --gpus all -v /data:/data pixi-with-checkpoints \ # -e boltz run_grid_search.py \ # --proteins /data/proteins.csv \ # --models boltz2 \ @@ -45,7 +45,7 @@ # --use-tweedie # # # Interactive shell -# docker run --gpus all -it sampleworks bash +# docker run --gpus all -it pixi-with-checkpoints bash # -# Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest): +# Baked-in checkpoints (from harbor.astera.sh/library/sampleworks-checkpoints:latest): # /checkpoints/boltz1_conf.ckpt - Boltz1 model (~3.5GB) @@ -56,7 +56,7 @@ # /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model (~1.4GB) # # Checkpoints base image: -# All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub. +# All checkpoints live in harbor.astera.sh/library/sampleworks-checkpoints:latest. # To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server. 
# ============================================================================ @@ -97,6 +97,7 @@ WORKDIR /app # Copy all project files - needed because sampleworks is installed as editable package # The pypi-dependencies section has: sampleworks = {editable = true, path = "."} COPY pyproject.toml pixi.lock ./ +COPY experiments/ ./experiments/ COPY src/ ./src/ COPY scripts/ ./scripts/ COPY run_grid_search.py ./ @@ -104,11 +105,11 @@ COPY docker-entrypoint.sh /usr/local/bin/entrypoint.sh RUN chmod +x /usr/local/bin/entrypoint.sh # ============================================================================ -# Bake in model checkpoints from pre-built base image on Docker Hub +# Bake in model checkpoints from pre-built Harbor image # ============================================================================ # Checkpoints (~10 GB) rarely change, so this layer is placed before pixi # installs to stay cached even when dependencies update. -COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ +COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ # ============================================================================ # Install all three environments: boltz, protenix, rf3 @@ -129,6 +130,19 @@ RUN pixi run -e boltz python -c "\ from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)" +# This image carries pixi environments and checkpoints. Runtime source should +# come from ACTL's synced checkout at /home/dev/workspace, not from stale code +# baked into /app during image construction. 
+RUN rm -rf /app/src /app/scripts /app/experiments /app/run_grid_search.py \ + && mkdir -p /home/dev/workspace + +COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/ +RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n cd /home/dev/workspace\nfi\n' >> /root/.bashrc + +ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app \ + SAMPLEWORKS_APP_DIR= \ + SAMPLEWORKS_REQUIRE_PREBUILT_PIXI=1 + # Set default checkpoint paths via environment variables ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \ BOLTZ2_CHECKPOINT=/checkpoints/boltz2_conf.ckpt \ diff --git a/GRID_SEARCH.md b/GRID_SEARCH.md index 154d00c5..b8f9aa73 100644 --- a/GRID_SEARCH.md +++ b/GRID_SEARCH.md @@ -6,14 +6,14 @@ and how to find and read logs if you need to debug the process. ## Optional: Setting up the docker container It is often useful to have a docker container with all the dependencies installed. -Our script `run_all_models.sh` for instance uses a docker container to manage all +Our script `run_experiments` for instance uses a docker container to manage all dependencies. To run that script, you will need to have docker installed. Build the container with ```shell -docker build -t diffuseproject/sampleworks . +docker build -t pixi-with-checkpoints . ``` which will add an image to your local docker repository called -`diffuseproject/sampleworks:latest`. The top of the `Dockerfile` contains +`pixi-with-checkpoints:latest`. The top of the `Dockerfile` contains instructions on how to use the container as well. The container entrypoint (`docker-entrypoint`) is fairly generic and is used to call the `run_grid_search.py` script described below. 
diff --git a/README.md b/README.md index 0b123355..f9f31aa8 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,100 @@ Output layout: `grid_search_results//[_]//ens Instructions for running evaluation and metrics scripts are coming soon. +## Running preset experiments on ACTL (`run_experiments`) + +This section is Astera-specific: it assumes access to ACTL, the internal Harbor +image registry, and the `diffuse-shared` PVC. External users can run the same +TOML presets with `sampleworks-runs` or `python -m sampleworks.runs.cli` after +setting equivalent local paths for `DATA_DIR`, `PROTEINS_CSV`, `RESULTS_DIR`, +`MSA_CACHE_DIR`, and model checkpoints. + +Start an 8-GPU ACTL machine named `sampleworks` with the Sampleworks image and +the shared data volume mounted: + +```bash +actl pod up sampleworks --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes +``` + +Keep that terminal open; it maintains sync and SSH. From another terminal: + +```bash +actl pod status sampleworks +# copy the `ssh:` line, then run it, for example: +ssh workspace.actl-ws--sampleworks.devspace +cd /home/dev/workspace +``` + +The main command is `run_experiments`. It reads TOML presets and launches the +right `run_grid_search.py` jobs, pixi environments, GPU assignments, logs, +results directory, and MSA cache. 
+ +```bash +export DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps +export PROTEINS_CSV="$DATA_DIR/proteins.csv" +export SAMPLEWORKS_ACTL_RUN_NAME="$(hostname -s)" + +run_experiments --list # show available presets (does not require DATA_DIR) +run_experiments --show rf3 # inspect what will run +run_experiments --dry-run rf3 # print commands without running +run_experiments rf3 # run the standalone RF3 preset +run_experiments boltz # run Boltz2 X-ray + Boltz2 MD +run_experiments boltz1 # run standalone Boltz1 +run_experiments protenix # run the standalone Protenix preset +run_experiments full_8gpu # run the full 8-GPU comparison preset +``` + +The default `full_8gpu` preset runs Boltz2 XRD, Boltz2 MD, RF3, and Protenix in +parallel. Run a subset with: + +```bash +run_experiments full_8gpu --jobs rf3,protenix +``` + +Standalone presets are available for each model/model family: `boltz`, +`boltz1`, `boltz2`, `boltz2_xrd`, `boltz2_md`, `rf3`, and `protenix`. +Additional comparison presets include `protenix_dual`, `rf3_protenix`, and RF3 +variants. Single-job presets default to `gpu_count = 8`, so on an 8-GPU pod +they use the whole machine. + +Presets live in `experiments/*.toml` in your local checkout and on the pod at +`/home/dev/workspace/experiments/*.toml`. To modify an experiment, edit or copy +a preset locally, let ACTL sync it, then run it by name or path: + +```bash +cp experiments/rf3_partial.toml experiments/my_rf3.toml +# edit experiments/my_rf3.toml locally +run_experiments --preset my_rf3 +``` + +For one-off changes, use `--set` instead of editing TOML: + +```bash +run_experiments rf3 --set jobs.rf3.gpu_count=4 +run_experiments rf3 --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02" +``` + +Presets usually declare `gpu_count = N`, not fixed GPU IDs. The runner assigns +visible GPUs automatically in job order, so the same preset works on different +pod sizes and fails fast if the pod has fewer visible GPUs than requested. 
Use +explicit `gpus = "0,1"` only when you need to pin a job to specific devices; the +runner validates those IDs before launching jobs. + +Set `DATA_DIR` and `PROTEINS_CSV` explicitly for each run so they are captured in +the shell history and launch logs. Checkpoints default to +`/mnt/diffuse-shared/raw/checkpoints` when those files exist, results go to +`/mnt/diffuse-shared/results/sampleworks///`, and MSA caches go to +`/mnt/diffuse-shared/cache/sampleworks/msa`. Override with `RESULTS_DIR`, +`MSA_CACHE_DIR`, or model-specific checkpoint variables before running. + +The ACTL image contains baked pixi environments under `/app/.pixi`. If your +synced branch changes `pyproject.toml` or `pixi.lock`, `run_experiments` stops +with a clear error instead of mutating the baked environment. For dependency +debugging only, opt into an on-pod pixi update with +`RUNTIME_PIXI=1 run_experiments ...`; reproducible scientist runs should use a +rebuilt `pixi-with-checkpoints:sampleworks` image instead. + + ## Docker TODO: Docker container documentation diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 957c351d..0477a7dc 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -2,15 +2,15 @@ # Sampleworks Docker Entrypoint # # Usage: -# docker run sampleworks -e