From 38542dafc275e52d7b1dff70be5515d6e636a9bc Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 18:38:08 -0700
Subject: [PATCH 01/28] feat(runs): add sampleworks-runs preset orchestrator

Adds a new `sampleworks-runs` CLI under `src/sampleworks/runs/` that
launches parallel `run_grid_search.py` jobs from a single TOML preset.
Five bundled presets (`all_models`, `rf3_partial`, `rf3_partial_chiral_off`,
`protenix_dual`, `rf3_protenix`) cover the canonical multi-model sweeps.

Each preset declares its jobs (pixi env, GPU assignment, args); the runner
sets `CUDA_VISIBLE_DEVICES`, shells out via `pixi run -e <env>`, tees per-job
logs, and aggregates exit codes. Dotted-path `--set` overrides let users
sweep parameters without editing TOML, e.g.:

  sampleworks-runs rf3_partial --set jobs.rf3.args.gradient-weights="0.0 0.01"

Pure stdlib (tomllib + dataclasses + argparse + subprocess), no new deps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                     |   1 +
 README.md                                     |  26 +++
 pyproject.toml                                |   4 +
 src/sampleworks/runs/__init__.py              |   5 +
 src/sampleworks/runs/cli.py                   | 122 ++++++++++++
 src/sampleworks/runs/loader.py                | 178 ++++++++++++++++++
 src/sampleworks/runs/presets/all_models.toml  |  44 +++++
 .../runs/presets/protenix_dual.toml           |  34 ++++
 src/sampleworks/runs/presets/rf3_partial.toml |  24 +++
 .../runs/presets/rf3_partial_chiral_off.toml  |  26 +++
 .../runs/presets/rf3_protenix.toml            |  29 +++
 src/sampleworks/runs/runner.py                | 146 ++++++++++++++
 src/sampleworks/runs/schema.py                |  60 ++++++
 tests/runs/__init__.py                        |   0
 tests/runs/test_cli.py                        |  88 +++++++++
 tests/runs/test_loader.py                     | 142 ++++++++++++++
 tests/runs/test_runner.py                     | 111 +++++++++++
 17 files changed, 1040 insertions(+)
 create mode 100644 src/sampleworks/runs/__init__.py
 create mode 100644 src/sampleworks/runs/cli.py
 create mode 100644 src/sampleworks/runs/loader.py
 create mode 100644 src/sampleworks/runs/presets/all_models.toml
 create mode 100644 src/sampleworks/runs/presets/protenix_dual.toml
 create mode 100644 src/sampleworks/runs/presets/rf3_partial.toml
 create mode 100644 src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
 create mode 100644 src/sampleworks/runs/presets/rf3_protenix.toml
 create mode 100644 src/sampleworks/runs/runner.py
 create mode 100644 src/sampleworks/runs/schema.py
 create mode 100644 tests/runs/__init__.py
 create mode 100644 tests/runs/test_cli.py
 create mode 100644 tests/runs/test_loader.py
 create mode 100644 tests/runs/test_runner.py
diff --git a/AGENTS.md b/AGENTS.md
index b56d51d6..7e1b9436 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -154,6 +154,7 @@ src/sampleworks/
 ├── metrics/               # Quality metrics (LDDT, sidechain)
 ├── eval/                  # Evaluation utilities
 ├── data/                  # Reference data (protein configs)
+├── runs/                  # `sampleworks-runs` CLI + TOML preset orchestrator
 └── utils/                 # Shared utilities
 ```
 
diff --git a/README.md b/README.md
index 0b123355..efdcc537 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,32 @@ Output layout: `grid_search_results/<protein>/<model>[_<method>]/<scaler>/ens<N>
 Instructions for running evaluation and metrics scripts are coming soon.
 
 
+## Preset experiments (`sampleworks-runs`)
+
+For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes.
+
+```bash
+pixi run -e rf3 sampleworks-runs --list                          # bundled presets
+pixi run -e rf3 sampleworks-runs rf3_partial                     # run a preset
+pixi run -e rf3 sampleworks-runs rf3_partial --show              # inspect resolved values
+pixi run -e rf3 sampleworks-runs rf3_partial --dry-run           # print pixi run commands, don't execute
+pixi run -e rf3 sampleworks-runs all_models --only rf3,protenix  # subset jobs
+
+# Override any value without editing the TOML:
+pixi run -e rf3 sampleworks-runs rf3_partial \
+    --set jobs.rf3.gpus=7 \
+    --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
+```
+
+Bundled presets live in `src/sampleworks/runs/presets/*.toml`. Add a new preset by dropping a `.toml` file alongside them or pointing at any path:
+
+```bash
+sampleworks-runs ./my_experiment.toml
+```
+
+Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block.
+
+
 ## Docker
 
 TODO: Docker container documentation
diff --git a/pyproject.toml b/pyproject.toml
index 613a784b..5b97b9fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,10 @@ version = "0.6.2"
 
 [project.scripts]
 sampleworks-guidance = "sampleworks.cli.guidance:main"
+sampleworks-runs = "sampleworks.runs.cli:main"
+
+[tool.hatch.build.targets.wheel.force-include]
+"src/sampleworks/runs/presets" = "sampleworks/runs/presets"
 
 [tool.hatch.metadata]
 allow-direct-references = true
diff --git a/src/sampleworks/runs/__init__.py b/src/sampleworks/runs/__init__.py
new file mode 100644
index 00000000..8bb071ac
--- /dev/null
+++ b/src/sampleworks/runs/__init__.py
@@ -0,0 +1,5 @@
+"""Preset-driven orchestrator for parallel run_grid_search.py invocations.
+
+Replaces the previous ACTL-native bash wrapper scripts with TOML presets +
+a thin Python runner. See ``sampleworks-runs --help``.
+"""
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
new file mode 100644
index 00000000..5a76f4e2
--- /dev/null
+++ b/src/sampleworks/runs/cli.py
@@ -0,0 +1,122 @@
+"""Command-line entry point for ``sampleworks-runs``."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+from . import loader, runner
+from .schema import Preset
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+
+    if args.list:
+        for name in loader.list_bundled_presets():
+            print(name)
+        return 0
+
+    if args.preset is None:
+        parser.error("PRESET is required (or pass --list)")
+
+    preset = loader.load_preset(args.preset, overrides=args.set)
+    if args.only:
+        preset = _filter_only(preset, args.only)
+
+    if args.show:
+        _print_show(preset)
+        return 0
+
+    results_dir = Path(args.results_dir or _default_results_dir(preset))
+    return runner.run(preset, results_dir=results_dir, dry_run=args.dry_run)
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="sampleworks-runs",
+        description=(
+            "Run a preset of parallel run_grid_search.py jobs. "
+            "Presets are TOML files bundled under sampleworks.runs.presets, "
+            "or pass a path to a .toml file directly."
+        ),
+    )
+    parser.add_argument("preset", nargs="?", help="Bundled preset name or path to a .toml file")
+    parser.add_argument("--list", action="store_true", help="List bundled presets and exit")
+    parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit")
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print the pixi run commands instead of executing them",
+    )
+    parser.add_argument(
+        "--only",
+        default="",
+        help="Comma-separated job names to run (subset). Default: all jobs.",
+    )
+    parser.add_argument(
+        "--set",
+        action="append",
+        default=[],
+        metavar="DOTTED_KEY=VALUE",
+        help=(
+            "Override a value in the loaded preset. Examples: "
+            "--set defaults.DATA_DIR=/data/foo, "
+            "--set jobs.rf3.args.gradient-weights='0.0 0.01', "
+            "--set jobs.0.gpus=5"
+        ),
+    )
+    parser.add_argument(
+        "--results-dir",
+        default=None,
+        help="Override RESULTS_DIR for this run (also controls per-job log location).",
+    )
+    return parser
+
+
+def _filter_only(preset: Preset, only: str) -> Preset:
+    """Return ``preset`` narrowed to the comma-separated job names in ``only``."""
+    names = {n.strip() for n in only.split(",") if n.strip()}
+    keep = [j for j in preset.jobs if j.name in names]
+    missing = names - {j.name for j in keep}
+    if missing:
+        raise SystemExit(f"--only references unknown jobs: {sorted(missing)}")
+    # Carry shared_args over too: dropping them strips every preset-wide flag from the jobs.
+    return Preset(
+        name=preset.name, description=preset.description, defaults=preset.defaults,
+        shared_args=preset.shared_args, jobs=keep,
+    )
+
+
+def _print_show(preset: Preset) -> None:
+    """Print the resolved preset; each job lists its effective (shared + own) args."""
+    print(f"name: {preset.name}")
+    if preset.description:
+        print(f"description: {preset.description}")
+    if preset.defaults:
+        print("defaults:")
+        for k, v in preset.defaults.items():
+            print(f"  {k} = {v}")
+    print("jobs:")
+    for j in preset.jobs:
+        print(f"  - name: {j.name}")
+        print(f"    env: {j.env}\n    gpus: {j.gpus}")
+        print(f"    output_subdir: {j.output_subdir}")
+        print("    args:")
+        for k, v in preset.effective_args(j).items():  # shared_args merged with per-job args
+            print(f"      {k} = {v!r}")
+
+
+def _default_results_dir(preset: Preset) -> str:
+    return (
+        preset.defaults.get("RESULTS_DIR")
+        or os.environ.get("RESULTS_DIR")
+        or "./grid_search_results"
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
new file mode 100644
index 00000000..328b4155
--- /dev/null
+++ b/src/sampleworks/runs/loader.py
@@ -0,0 +1,178 @@
+"""Load presets from TOML and apply runtime overrides.
+
+Processing order (see ``load_preset``):
+  1. ``--set <dotted-path>=<value>`` CLI overrides are applied to the raw TOML first.
+  2. ``${VAR}`` references in every string are then resolved: the process environment
+     wins, and the ``[defaults]`` block (including ``--set defaults.*``) fills the gaps.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import tomllib
+from collections.abc import Iterable
+from importlib import resources
+from pathlib import Path
+from typing import Any
+
+from .schema import Job, Preset
+
+_BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets"
+_VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
+
+
+def list_bundled_presets() -> list[str]:
+    """Return the names (sans ``.toml``) of bundled presets, sorted."""
+    files = resources.files(_BUNDLED_PRESETS_PACKAGE)
+    return sorted(p.name.removesuffix(".toml") for p in files.iterdir() if p.name.endswith(".toml"))
+
+
+def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
+    """Load a preset by bundled name or filesystem path, applying ``--set`` overrides."""
+    raw = _read_toml(name_or_path)
+    overrides_list = list(overrides)
+    raw = _apply_overrides(raw, overrides_list)
+    raw = _resolve_variables(raw)
+    return _build_preset(name=_preset_name(name_or_path), raw=raw)
+
+
+def _read_toml(name_or_path: str) -> dict[str, Any]:
+    path = Path(name_or_path)  # an existing *.toml path wins over a bundled preset name
+    if path.suffix == ".toml" and path.exists():
+        return tomllib.loads(path.read_text(encoding="utf-8"))  # TOML is UTF-8 by spec
+    bundled = resources.files(_BUNDLED_PRESETS_PACKAGE) / f"{name_or_path}.toml"
+    if not bundled.is_file():
+        raise FileNotFoundError(
+            f"No preset {name_or_path!r}. Bundled: {list_bundled_presets()}. "
+            f"Or pass a path to a .toml file."
+        )
+    return tomllib.loads(bundled.read_text(encoding="utf-8"))  # not the locale encoding
+
+
+def _preset_name(name_or_path: str) -> str:
+    return Path(name_or_path).stem if name_or_path.endswith(".toml") else name_or_path
+
+
+def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any]:
+    for spec in overrides:
+        if "=" not in spec:
+            raise ValueError(f"--set expects KEY=VALUE, got {spec!r}")
+        key, value = spec.split("=", 1)
+        _set_dotted(raw, key.strip(), _coerce(value))
+    return raw
+
+
+def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
+    """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``."""
+    parts = dotted.split(".")
+    cursor: Any = obj
+    for i, part in enumerate(parts[:-1]):
+        cursor = _index(cursor, part, where=".".join(parts[: i + 1]))
+    leaf_parent = cursor
+    leaf_key = parts[-1]
+    if isinstance(leaf_parent, list):
+        leaf_parent[_find_in_list(leaf_parent, leaf_key, where=dotted)] = value
+    else:
+        leaf_parent[leaf_key] = value
+
+
+def _index(cursor: Any, part: str, *, where: str) -> Any:
+    if isinstance(cursor, list):
+        return cursor[_find_in_list(cursor, part, where=where)]
+    if isinstance(cursor, dict):
+        if part not in cursor:
+            cursor[part] = {}
+        return cursor[part]
+    raise TypeError(f"Cannot descend into {type(cursor).__name__} at {where!r}")
+
+
+def _find_in_list(items: list[Any], key: str, *, where: str) -> int:
+    if key.isdigit() or (key.startswith("-") and key[1:].isdigit()):
+        return int(key)
+    for i, item in enumerate(items):
+        if isinstance(item, dict) and item.get("name") == key:
+            return i
+    raise KeyError(f"No list element named {key!r} at {where!r}")
+
+
+def _coerce(value: str) -> Any:
+    if value.lower() in ("true", "false"):
+        return value.lower() == "true"
+    try:
+        return int(value)
+    except ValueError:
+        pass
+    try:
+        return float(value)
+    except ValueError:
+        pass
+    return value
+
+
+def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]:
+    """Expand ``${VAR}`` in every string. Env wins; defaults block fills gaps.
+
+    Defaults are resolved in TOML order, so later defaults can reference earlier ones
+    (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``).
+    """
+    defaults: dict[str, str] = dict(raw.get("defaults", {}))
+    accumulated: dict[str, str] = dict(os.environ)
+    resolved_defaults: dict[str, str] = {}
+    for key, default_value in defaults.items():
+        if key in os.environ:
+            resolved_defaults[key] = os.environ[key]
+        else:
+            resolved_defaults[key] = _expand(default_value, accumulated)
+        accumulated[key] = resolved_defaults[key]
+    resolved = _walk(raw, accumulated)
+    resolved["defaults"] = resolved_defaults
+    return resolved
+
+
+def _walk(obj: Any, env: dict[str, str]) -> Any:
+    if isinstance(obj, dict):
+        return {k: _walk(v, env) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_walk(item, env) for item in obj]
+    if isinstance(obj, str):
+        return _expand(obj, env)
+    return obj
+
+
+def _expand(text: str, env: dict[str, str]) -> str:
+    def repl(match: re.Match[str]) -> str:
+        var = match.group(1)
+        if var not in env:
+            raise KeyError(f"Undefined variable ${{{var}}} in preset (no env var, no default)")
+        return env[var]
+
+    current = text  # re-substitute until stable so values may themselves contain ${VAR}
+    for _ in range(32):  # bounded: a self-referential value would otherwise loop forever
+        prev, current = current, _VAR_PATTERN.sub(repl, current)
+        if prev == current:
+            return current
+    raise ValueError(f"Variable expansion of {text!r} did not converge (cyclic reference?)")
+
+
+def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
+    raw_jobs = raw.get("jobs", [])
+    if not isinstance(raw_jobs, list):
+        raise ValueError(f"Preset {name!r}: 'jobs' must be a list")
+    jobs = [
+        Job(
+            name=str(j["name"]),
+            env=str(j["env"]),
+            gpus=str(j["gpus"]),
+            output_subdir=str(j["output_subdir"]),
+            args=dict(j.get("args", {})),
+        )
+        for j in raw_jobs
+    ]
+    return Preset(
+        name=name,
+        description=str(raw.get("description", "")),
+        defaults=dict(raw.get("defaults", {})),
+        shared_args=dict(raw.get("shared_args", {})),
+        jobs=jobs,
+    )
diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml
new file mode 100644
index 00000000..21d1cd92
--- /dev/null
+++ b/src/sampleworks/runs/presets/all_models.toml
@@ -0,0 +1,44 @@
+description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)."
+
+[defaults]
+DATA_DIR = "/data/input"
+RESULTS_DIR = "/data/results"
+MSA_CACHE_DIR = "${HOME}/.sampleworks/msa"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz2_xrd"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz2_xrd"
+args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
+
+[[jobs]]
+name = "boltz2_md"
+env = "boltz"
+gpus = "2,3"
+output_subdir = "boltz2_md"
+args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
+
+[[jobs]]
+name = "rf3"
+env = "rf3"
+gpus = "4,5"
+output_subdir = "rf3"
+args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1" }
+
+[[jobs]]
+name = "protenix"
+env = "protenix"
+gpus = "6,7"
+output_subdir = "protenix"
+args = { model = "protenix", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml
new file mode 100644
index 00000000..461d547e
--- /dev/null
+++ b/src/sampleworks/runs/presets/protenix_dual.toml
@@ -0,0 +1,34 @@
+description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)."
+
+[defaults]
+DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results"
+MSA_CACHE_DIR = "/data/sampleworks-exp/msa_cache"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt"
+PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "protenix"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "protenix_tiny"
+env = "protenix"
+gpus = "2,3"
+output_subdir = "protenix_tiny"
+args = { model-checkpoint = "${PROTENIX_TINY_CHECKPOINT}" }
+
+[[jobs]]
+name = "protenix_mini"
+env = "protenix"
+gpus = "6,7"
+output_subdir = "protenix_mini"
+args = { model-checkpoint = "${PROTENIX_MINI_CHECKPOINT}" }
diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml
new file mode 100644
index 00000000..533accfb
--- /dev/null
+++ b/src/sampleworks/runs/presets/rf3_partial.toml
@@ -0,0 +1,24 @@
+description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)."
+
+[defaults]
+DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results"
+MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "rf3"
+env = "rf3"
+gpus = "4"
+output_subdir = "rf3"
+args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
new file mode 100644
index 00000000..c1f34820
--- /dev/null
+++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
@@ -0,0 +1,26 @@
+description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep."
+
+[defaults]
+DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results_rf3_chiral_off"
+MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+force-all = true
+disable-chiral-features = true
+
+[[jobs]]
+name = "rf3"
+env = "rf3"
+gpus = "5"
+output_subdir = "."
+args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml
new file mode 100644
index 00000000..9255cfea
--- /dev/null
+++ b/src/sampleworks/runs/presets/rf3_protenix.toml
@@ -0,0 +1,29 @@
+description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)."
+
+[defaults]
+DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results"
+MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "rf3"
+env = "rf3"
+gpus = "0,1,2,3"
+output_subdir = "rf3"
+args = { model = "rf3", gradient-weights = "0.0 0.01 0.02 0.05 0.1" }
+
+[[jobs]]
+name = "protenix"
+env = "protenix"
+gpus = "4,5,6,7"
+output_subdir = "protenix"
+args = { model = "protenix", partial-diffusion-step = 120, gradient-weights = "0.0 0.1 0.2 0.5" }
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
new file mode 100644
index 00000000..2623ba22
--- /dev/null
+++ b/src/sampleworks/runs/runner.py
@@ -0,0 +1,146 @@
+"""Build job argv and orchestrate parallel subprocess execution."""
+
+from __future__ import annotations
+
+import os
+import shlex
+import subprocess
+import sys
+import threading
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from .schema import Job, Preset
+
+GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
+
+
+@dataclass
+class JobInvocation:
+    job: Job
+    argv: list[str]
+    env: dict[str, str]
+    log_path: Path
+
+
+def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocation]:
+    """Build the subprocess argv + env + log path for every job in the preset."""
+    invocations: list[JobInvocation] = []
+    for job in preset.jobs:
+        args = preset.effective_args(job)
+        args.setdefault("output-dir", str(results_dir / job.output_subdir))
+        argv = _build_argv(job.env, args)
+        env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus}
+        log_path = results_dir / f"{job.name}_run.log"
+        invocations.append(JobInvocation(job=job, argv=argv, env=env, log_path=log_path))
+    return invocations
+
+
+def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
+    argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT]
+    for key, value in args.items():
+        flag = f"--{key}"
+        if isinstance(value, bool):
+            if value:
+                argv.append(flag)
+        elif value is None:
+            continue
+        else:
+            argv.extend([flag, str(value)])
+    return argv
+
+
+def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
+    """Launch every job in parallel; tee output to per-job logs; return 0 iff all succeed."""
+    invocations = build_invocations(preset, results_dir=results_dir)
+
+    if dry_run:
+        for inv in invocations:
+            _print_dry_run(inv)
+        return 0
+
+    results_dir.mkdir(parents=True, exist_ok=True)  # only touch the filesystem on a real run
+    _print_launch_summary(preset, invocations)
+    processes = [_spawn(inv) for inv in invocations]
+    return _wait_all(processes)
+
+
+def _print_dry_run(inv: JobInvocation) -> None:
+    print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
+    print(f"# log: {inv.log_path}", file=sys.stderr)
+    print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}")
+    print(file=sys.stderr)
+
+
+def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> None:
+    bar = "=" * 60
+    print(bar, file=sys.stderr)
+    print(f"preset: {preset.name}", file=sys.stderr)
+    if preset.description:
+        print(f"  {preset.description}", file=sys.stderr)
+    for inv in invocations:
+        print(
+            f"  - {inv.job.name}: env={inv.job.env}, gpus={inv.job.gpus}, log={inv.log_path}",
+            file=sys.stderr,
+        )
+    print(bar, file=sys.stderr)
+
+
+@dataclass
+class _RunningJob:
+    inv: JobInvocation
+    proc: subprocess.Popen[bytes]
+    tee_thread: threading.Thread
+
+
+def _spawn(inv: JobInvocation) -> _RunningJob:
+    """Start one job and a daemon thread that tees its merged output to ``inv.log_path``."""
+    inv.log_path.parent.mkdir(parents=True, exist_ok=True)
+    log_file = open(inv.log_path, "wb")
+    try:
+        proc = subprocess.Popen(
+            inv.argv, env=inv.env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=0
+        )
+    except BaseException:
+        # Launch failed (e.g. ``pixi`` is not on PATH): close the log handle before re-raising.
+        log_file.close()
+        raise
+    assert proc.stdout is not None
+    thread = threading.Thread(
+        target=_tee, args=(inv.job.name, proc.stdout, log_file), daemon=True
+    )
+    thread.start()  # from here on _tee owns log_file and closes it at EOF
+    print(f"[{_ts()}] launched {inv.job.name} (pid {proc.pid})", file=sys.stderr)
+    return _RunningJob(inv=inv, proc=proc, tee_thread=thread)
+
+
+def _wait_all(jobs: list[_RunningJob]) -> int:
+    failures = 0
+    for j in jobs:
+        exit_code = j.proc.wait()
+        j.tee_thread.join()
+        if exit_code == 0:
+            print(f"[{_ts()}] {j.inv.job.name} succeeded", file=sys.stderr)
+        else:
+            print(f"[{_ts()}] {j.inv.job.name} FAILED (exit {exit_code})", file=sys.stderr)
+            failures += 1
+    return 0 if failures == 0 else 1
+
+
+def _tee(prefix: str, src: Any, dest: Any) -> None:
+    for line in iter(src.readline, b""):
+        dest.write(line)
+        dest.flush()
+        sys.stderr.write(f"[{prefix}] {line.decode('utf-8', errors='replace')}")
+        sys.stderr.flush()
+    dest.close()
+
+
+def _ts() -> str:
+    return time.strftime("%Y-%m-%d %H:%M:%S")
+
+
+def _shell_join(argv: list[str]) -> str:
+    return shlex.join(argv)
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
new file mode 100644
index 00000000..216a6bc0
--- /dev/null
+++ b/src/sampleworks/runs/schema.py
@@ -0,0 +1,60 @@
+"""Dataclasses for the preset schema.
+
+A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
+is launched as ``pixi run -e <env> python /app/run_grid_search.py <args>`` with
+``CUDA_VISIBLE_DEVICES`` set to the job's GPU assignment.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+VALID_PIXI_ENVS = ("boltz", "protenix", "rf3")
+
+
+@dataclass
+class Job:
+    name: str
+    env: str
+    gpus: str
+    output_subdir: str
+    args: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        if self.env not in VALID_PIXI_ENVS:
+            raise ValueError(
+                f"Job {self.name!r}: env must be one of {VALID_PIXI_ENVS}, got {self.env!r}"
+            )
+        if not self.gpus:
+            raise ValueError(f"Job {self.name!r}: gpus must be non-empty")
+        if not self.output_subdir:
+            raise ValueError(f"Job {self.name!r}: output_subdir must be non-empty")
+
+
+@dataclass
+class Preset:
+    name: str
+    description: str
+    defaults: dict[str, str] = field(default_factory=dict)
+    shared_args: dict[str, Any] = field(default_factory=dict)
+    jobs: list[Job] = field(default_factory=list)
+
+    def __post_init__(self) -> None:
+        if not self.jobs:
+            raise ValueError(f"Preset {self.name!r}: must declare at least one job")
+        seen: set[str] = set()
+        for job in self.jobs:
+            if job.name in seen:
+                raise ValueError(f"Preset {self.name!r}: duplicate job name {job.name!r}")
+            seen.add(job.name)
+
+    def job(self, name: str) -> Job:
+        for j in self.jobs:
+            if j.name == name:
+                return j
+        raise KeyError(f"Preset {self.name!r} has no job {name!r}")
+
+    def effective_args(self, job: Job) -> dict[str, Any]:
+        """Return ``shared_args`` merged with per-job overrides."""
+        return {**self.shared_args, **job.args}
diff --git a/tests/runs/__init__.py b/tests/runs/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
new file mode 100644
index 00000000..c93afe2c
--- /dev/null
+++ b/tests/runs/test_cli.py
@@ -0,0 +1,88 @@
+"""End-to-end CLI tests (--list, --show, --dry-run, --only)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from sampleworks.runs import cli
+
+
+def test_list_prints_all_bundled_presets(capsys: pytest.CaptureFixture[str]) -> None:
+    exit_code = cli.main(["--list"])
+    assert exit_code == 0
+    out = capsys.readouterr().out.splitlines()
+    assert set(out) == {
+        "all_models",
+        "rf3_partial",
+        "rf3_partial_chiral_off",
+        "protenix_dual",
+        "rf3_protenix",
+    }
+
+
+def test_show_prints_resolved_preset(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(["rf3_partial", "--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: rf3_partial" in out
+    assert "gradient-weights" in out
+
+
+def test_dry_run_does_not_invoke_subprocess(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setenv("HOME", str(tmp_path))
+    exit_code = cli.main(
+        ["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)]
+    )
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "pixi run -e rf3 python /app/run_grid_search.py" in out
+    assert "CUDA_VISIBLE_DEVICES=4" in out
+
+
+def test_only_filters_to_subset(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(["all_models", "--only", "rf3,protenix", "--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: rf3" in out
+    assert "name: protenix" in out
+    assert "boltz2_xrd" not in out
+    assert "boltz2_md" not in out
+
+
+def test_only_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(SystemExit, match="unknown jobs"):
+        cli.main(["all_models", "--only", "nonexistent", "--show"])
+
+
+def test_set_override_propagates_through_cli(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(
+        [
+            "rf3_partial",
+            "--set",
+            "jobs.rf3.args.gradient-weights=0.0 0.01",
+            "--show",
+        ]
+    )
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "0.0 0.01" in out
+
+
+def test_no_preset_and_no_list_errors(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(SystemExit):
+        cli.main([])
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
new file mode 100644
index 00000000..66eeb5b9
--- /dev/null
+++ b/tests/runs/test_loader.py
@@ -0,0 +1,142 @@
+"""Unit tests for sampleworks.runs.loader."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+from sampleworks.runs import loader
+from sampleworks.runs.schema import Preset
+
+
+BUNDLED = ["all_models", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
+
+
+def test_list_bundled_presets_returns_the_five() -> None:
+    names = loader.list_bundled_presets()
+    assert set(names) == set(BUNDLED), f"unexpected bundled presets: {names}"
+
+
+@pytest.mark.parametrize("name", BUNDLED)
+def test_each_bundled_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset(name)
+    assert preset.name == name
+    assert preset.jobs, f"{name} has no jobs"
+    for job in preset.jobs:
+        assert job.env in ("boltz", "protenix", "rf3")
+
+
+def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    monkeypatch.setenv("DATA_DIR", "/from/env")
+    preset = loader.load_preset("rf3_partial")
+    assert preset.defaults["DATA_DIR"] == "/from/env"
+    rf3 = preset.job("rf3")
+    # PROTEINS_CSV expands to ${DATA_DIR}/proteins.csv; DATA_DIR overridden by env
+    proteins = preset.shared_args["proteins"]
+    assert proteins == "/from/env/proteins.csv"
+
+
+def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("DATA_DIR", raising=False)
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("rf3_partial")
+    assert preset.defaults["DATA_DIR"] == "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+
+
+def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("DATA_DIR", raising=False)
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("rf3_partial", overrides=["defaults.DATA_DIR=/custom"])
+    assert preset.defaults["DATA_DIR"] == "/custom"
+    assert preset.shared_args["proteins"] == "/custom/proteins.csv"
+
+
+def test_set_override_at_job_by_name(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("all_models", overrides=["jobs.rf3.gpus=7"])
+    assert preset.job("rf3").gpus == "7"
+
+
+def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("all_models", overrides=["jobs.0.gpus=9"])
+    assert preset.jobs[0].gpus == "9"
+
+
+def test_set_override_at_args_inside_job(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset(
+        "rf3_partial", overrides=["jobs.rf3.args.gradient-weights=0.0 0.01"]
+    )
+    assert preset.job("rf3").args["gradient-weights"] == "0.0 0.01"
+
+
+def test_set_coerces_bool_and_int(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset(
+        "rf3_partial",
+        overrides=[
+            "shared_args.gradient-normalization=false",
+            "jobs.rf3.args.partial-diffusion-step=200",
+        ],
+    )
+    assert preset.shared_args["gradient-normalization"] is False
+    # job.args["partial-diffusion-step"] doesn't exist by default in rf3_partial,
+    # but --set should still create or override it
+    assert preset.job("rf3").args["partial-diffusion-step"] == 200
+
+
+def test_load_preset_from_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    custom = tmp_path / "mycustom.toml"
+    custom.write_text(
+        'description = "custom"\n'
+        "[defaults]\n"
+        'DATA_DIR = "/x"\n'
+        "[shared_args]\n"
+        'model = "rf3"\n'
+        "[[jobs]]\n"
+        'name = "j1"\n'
+        'env = "rf3"\n'
+        'gpus = "0"\n'
+        'output_subdir = "j1"\n'
+        "args = {}\n"
+    )
+    preset = loader.load_preset(str(custom))
+    assert preset.name == "mycustom"
+    assert preset.defaults["DATA_DIR"] == "/x"
+
+
+def test_unknown_preset_raises() -> None:
+    with pytest.raises(FileNotFoundError):
+        loader.load_preset("does_not_exist")
+
+
+def test_undefined_variable_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    bad = tmp_path / "bad.toml"
+    bad.write_text(
+        '[shared_args]\nproteins = "${NEVER_DEFINED_VAR}/x"\n'
+        '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n'
+    )
+    monkeypatch.delenv("NEVER_DEFINED_VAR", raising=False)
+    with pytest.raises(KeyError, match="NEVER_DEFINED_VAR"):
+        loader.load_preset(str(bad))
+
+
+def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(ValueError, match="KEY=VALUE"):
+        loader.load_preset("rf3_partial", overrides=["bogus_no_equals"])
+
+
+def test_bad_env_rejected(tmp_path: Path) -> None:
+    bad = tmp_path / "bad.toml"
+    bad.write_text(
+        '[[jobs]]\nname = "j"\nenv = "not_a_real_env"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n'
+    )
+    with pytest.raises(ValueError, match="env must be one of"):
+        loader.load_preset(str(bad))
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
new file mode 100644
index 00000000..4ac31ab9
--- /dev/null
+++ b/tests/runs/test_runner.py
@@ -0,0 +1,111 @@
+"""Unit tests for sampleworks.runs.runner argv builder."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from sampleworks.runs import loader, runner
+
+
+def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Faithful translation: argv should match the canonical rf3_partial bash invocation."""
+    monkeypatch.setenv("HOME", "/home/test")
+    monkeypatch.delenv("DATA_DIR", raising=False)
+    monkeypatch.delenv("RESULTS_DIR", raising=False)
+    preset = loader.load_preset("rf3_partial")
+    invocations = runner.build_invocations(preset, results_dir=Path("/results"))
+
+    assert len(invocations) == 1
+    inv = invocations[0]
+    assert inv.job.name == "rf3"
+    assert inv.env["CUDA_VISIBLE_DEVICES"] == "4"
+    assert inv.log_path == Path("/results/rf3_run.log")
+
+    argv = inv.argv
+    assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"]
+    pairs = _argv_to_dict(argv[6:])
+    assert pairs["--proteins"] == (
+        "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv"
+    )
+    assert pairs["--model"] == "rf3"
+    assert pairs["--scalers"] == "pure_guidance"
+    assert pairs["--partial-diffusion-step"] == "120"
+    assert pairs["--ensemble-sizes"] == "8"
+    assert pairs["--gradient-weights"] == "0.0 0.005 0.01 0.02 0.035 0.05 0.1"
+    assert pairs["--model-checkpoint"] == "/checkpoints/rf3_foundry_01_24_latest.ckpt"
+    assert pairs["--output-dir"] == "/results/rf3"
+    # store_true flags appear as bare keys (value=True in our dict)
+    assert pairs["--gradient-normalization"] is True
+    assert pairs["--augmentation"] is True
+    assert pairs["--align-to-input"] is True
+
+
+def test_argv_omits_false_bool_flags(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset(
+        "rf3_partial", overrides=["shared_args.gradient-normalization=false"]
+    )
+    inv = runner.build_invocations(preset, results_dir=Path("/results"))[0]
+    assert "--gradient-normalization" not in inv.argv
+
+
+def test_explicit_output_dir_in_args_wins_over_subdir_default(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    custom = tmp_path / "custom.toml"
+    custom.write_text(
+        "[shared_args]\n"
+        '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "sub"\n'
+        'args = { "output-dir" = "/explicit/path" }\n'
+    )
+    preset = loader.load_preset(str(custom))
+    inv = runner.build_invocations(preset, results_dir=Path("/results"))[0]
+    pairs = _argv_to_dict(inv.argv[6:])
+    assert pairs["--output-dir"] == "/explicit/path"
+
+
+def test_all_models_has_four_jobs_with_distinct_gpus(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("all_models")
+    invocations = runner.build_invocations(preset, results_dir=Path("/r"))
+    assert [i.job.name for i in invocations] == ["boltz2_xrd", "boltz2_md", "rf3", "protenix"]
+    gpu_assignments = [i.env["CUDA_VISIBLE_DEVICES"] for i in invocations]
+    assert gpu_assignments == ["0,1", "2,3", "4,5", "6,7"]
+
+
+def test_protenix_dual_uses_different_checkpoints(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("protenix_dual")
+    invocations = runner.build_invocations(preset, results_dir=Path("/r"))
+    pairs = [_argv_to_dict(i.argv[6:]) for i in invocations]
+    assert pairs[0]["--model-checkpoint"] == "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt"
+    assert pairs[1]["--model-checkpoint"] == "/extra_checkpoints/protenix_mini_default_v0.5.0.pt"
+
+
+def test_rf3_partial_chiral_off_flag_present(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("rf3_partial_chiral_off")
+    inv = runner.build_invocations(preset, results_dir=Path("/r"))[0]
+    assert "--disable-chiral-features" in inv.argv
+    assert "--force-all" in inv.argv
+
+
+def _argv_to_dict(tail: list[str]) -> dict[str, object]:
+    """Turn ``[--a, 1, --b, --c, 2]`` into ``{'--a': '1', '--b': True, '--c': '2'}``."""
+    out: dict[str, object] = {}
+    i = 0
+    while i < len(tail):
+        flag = tail[i]
+        assert flag.startswith("--"), f"unexpected positional: {flag}"
+        if i + 1 < len(tail) and not tail[i + 1].startswith("--"):
+            out[flag] = tail[i + 1]
+            i += 2
+        else:
+            out[flag] = True
+            i += 1
+    return out

From f25a71eea84b7a991b42f551caba8cce9f746767 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 21:06:19 -0700
Subject: [PATCH 02/28] style(runs): fix ruff lint findings

Ruff auto-fixed import ordering (I001) and unused imports (F401). The two
remaining findings were a leftover unused local `rf3 = preset.job("rf3")`
(F841), which was removed, and one over-100-char assertion (E501), whose
expected value was extracted into a local.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sampleworks/runs/loader.py | 1 +
 src/sampleworks/runs/runner.py | 1 +
 src/sampleworks/runs/schema.py | 1 +
 tests/runs/test_cli.py         | 1 -
 tests/runs/test_loader.py      | 7 ++-----
 tests/runs/test_runner.py      | 1 -
 6 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index 328b4155..906abcca 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -18,6 +18,7 @@
 
 from .schema import Job, Preset
 
+
 _BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets"
 _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
 
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 2623ba22..87b918b0 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -14,6 +14,7 @@
 
 from .schema import Job, Preset
 
+
 GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
 
 
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index 216a6bc0..2f50a3e9 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -10,6 +10,7 @@
 from dataclasses import dataclass, field
 from typing import Any
 
+
 VALID_PIXI_ENVS = ("boltz", "protenix", "rf3")
 
 
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index c93afe2c..0e22cd5f 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 
 import pytest
-
 from sampleworks.runs import cli
 
 
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 66eeb5b9..c828b7c9 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -2,13 +2,10 @@
 
 from __future__ import annotations
 
-import os
 from pathlib import Path
 
 import pytest
-
 from sampleworks.runs import loader
-from sampleworks.runs.schema import Preset
 
 
 BUNDLED = ["all_models", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
@@ -34,7 +31,6 @@ def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> No
     monkeypatch.setenv("DATA_DIR", "/from/env")
     preset = loader.load_preset("rf3_partial")
     assert preset.defaults["DATA_DIR"] == "/from/env"
-    rf3 = preset.job("rf3")
     # PROTEINS_CSV expands to ${DATA_DIR}/proteins.csv; DATA_DIR overridden by env
     proteins = preset.shared_args["proteins"]
     assert proteins == "/from/env/proteins.csv"
@@ -44,7 +40,8 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial")
-    assert preset.defaults["DATA_DIR"] == "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+    expected = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+    assert preset.defaults["DATA_DIR"] == expected
 
 
 def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 4ac31ab9..050d7458 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 
 import pytest
-
 from sampleworks.runs import loader, runner
 
 

From c3f0d12fec2eeba9ba184a6a2e1d7662854ff751 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 21:11:46 -0700
Subject: [PATCH 03/28] style(runs): apply ruff format

CI's lint job runs both `ruff check` and `ruff format --check`. The
prior commit fixed only the former.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/runs/test_cli.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index 0e22cd5f..1023f0b3 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -36,9 +36,7 @@ def test_dry_run_does_not_invoke_subprocess(
     monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str]
 ) -> None:
     monkeypatch.setenv("HOME", str(tmp_path))
-    exit_code = cli.main(
-        ["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)]
-    )
+    exit_code = cli.main(["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)])
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "pixi run -e rf3 python /app/run_grid_search.py" in out

From d5d065a3ff8f3699e303238bcec2091cde057491 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 21:17:15 -0700
Subject: [PATCH 04/28] fix(runs): address CodeRabbit review findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- cli.py: `--only` filter now preserves `shared_args`; previously the
  filtered Preset dropped them, silently losing every job's shared flags.
- loader.py: `--set` now rejects unknown top-level keys. A typo like
  `--set job.rf3.gpus=0` (note missing 's') used to auto-create an unused
  `job` dict and silently no-op; now it raises KeyError with the valid
  keys listed.
- runner.py: handle partial spawn failures. If one job fails to spawn
  midway, already-launched jobs are terminated and joined instead of
  being orphaned. Also wraps Popen in try/except so the log file is
  closed (no leaked handle) if the subprocess fails to start.
- presets/rf3_partial_chiral_off.toml: change `output_subdir = "."`
  (write to RESULTS_DIR root) to `"rf3"` (subdir under RESULTS_DIR) for
  consistency with the other RF3 presets and collision safety when
  RESULTS_DIR is overridden to a shared location.

Skipped (faithful to bash original): rf3_protenix.toml asymmetric
partial-diffusion-step — the source `run_rf3_protenix_mdc_actl.sh`
deliberately set it only for the Protenix job, not RF3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sampleworks/runs/cli.py                   |  1 +
 src/sampleworks/runs/loader.py                |  8 +++++
 .../runs/presets/rf3_partial_chiral_off.toml  |  2 +-
 src/sampleworks/runs/runner.py                | 36 ++++++++++++++-----
 tests/runs/test_loader.py                     |  7 ++++
 5 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 5a76f4e2..75b3ab2e 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -87,6 +87,7 @@ def _filter_only(preset: Preset, only: str) -> Preset:
         name=preset.name,
         description=preset.description,
         defaults=preset.defaults,
+        shared_args=preset.shared_args,
         jobs=keep,
     )
 
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index 906abcca..65be05ac 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -64,9 +64,17 @@ def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any
     return raw
 
 
+_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
+
+
 def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
     """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``."""
     parts = dotted.split(".")
+    if parts[0] not in _TOP_LEVEL_KEYS:
+        raise KeyError(
+            f"--set: unknown top-level key {parts[0]!r} in {dotted!r}. "
+            f"Valid top-level keys: {sorted(_TOP_LEVEL_KEYS)}"
+        )
     cursor: Any = obj
     for i, part in enumerate(parts[:-1]):
         cursor = _index(cursor, part, where=".".join(parts[: i + 1]))
diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
index c1f34820..bd3a1311 100644
--- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
+++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
@@ -22,5 +22,5 @@ disable-chiral-features = true
 name = "rf3"
 env = "rf3"
 gpus = "5"
-output_subdir = "."
+output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 87b918b0..4e273afa 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -64,10 +64,26 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
         return 0
 
     _print_launch_summary(preset, invocations)
-    processes = [_spawn(inv) for inv in invocations]
+    processes: list[_RunningJob] = []
+    try:
+        for inv in invocations:
+            processes.append(_spawn(inv))
+    except BaseException:
+        _terminate_all(processes)
+        raise
     return _wait_all(processes)
 
 
+def _terminate_all(jobs: list[_RunningJob]) -> None:
+    """Terminate any already-launched jobs (used when a later spawn fails)."""
+    for j in jobs:
+        if j.proc.poll() is None:
+            j.proc.terminate()
+    for j in jobs:
+        j.proc.wait()
+        j.tee_thread.join()
+
+
 def _print_dry_run(inv: JobInvocation) -> None:
     print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
     print(f"# log: {inv.log_path}", file=sys.stderr)
@@ -99,13 +115,17 @@ class _RunningJob:
 def _spawn(inv: JobInvocation) -> _RunningJob:
     inv.log_path.parent.mkdir(parents=True, exist_ok=True)
     log_file = open(inv.log_path, "wb")
-    proc = subprocess.Popen(
-        inv.argv,
-        env=inv.env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        bufsize=0,
-    )
+    try:
+        proc = subprocess.Popen(
+            inv.argv,
+            env=inv.env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            bufsize=0,
+        )
+    except BaseException:
+        log_file.close()
+        raise
     assert proc.stdout is not None
     thread = threading.Thread(
         target=_tee,
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index c828b7c9..d8d6509c 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -130,6 +130,13 @@ def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None:
         loader.load_preset("rf3_partial", overrides=["bogus_no_equals"])
 
 
+def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Typos like ``--set job.rf3.gpus=0`` (missing 's' in jobs) must not silently no-op."""
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(KeyError, match="unknown top-level key"):
+        loader.load_preset("rf3_partial", overrides=["job.rf3.gpus=0"])
+
+
 def test_bad_env_rejected(tmp_path: Path) -> None:
     bad = tmp_path / "bad.toml"
     bad.write_text(

From a89be9d2eea84bb971de5c2e0eea019decc5bb53 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 21:22:52 -0700
Subject: [PATCH 05/28] style(runs): NumPy-style docstrings + frozen
 dataclasses

Aligns the new sampleworks.runs module with project style policy
(AGENTS.md L7 and L338):

- Add NumPy-style docstrings (summary, Parameters, Returns, Raises) to
  every public and private function/class across schema.py, loader.py,
  runner.py, and cli.py.
- Mark Job, Preset, JobInvocation, and _RunningJob as
  @dataclass(frozen=True). Behavior is preserved: Preset.effective_args
  already returns a fresh dict, and the runner mutates that local copy
  rather than the Preset itself.

All 32 unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sampleworks/runs/cli.py    |  65 +++++++++
 src/sampleworks/runs/loader.py | 250 +++++++++++++++++++++++++++++++--
 src/sampleworks/runs/runner.py | 167 +++++++++++++++++++++-
 src/sampleworks/runs/schema.py |  89 +++++++++++-
 4 files changed, 553 insertions(+), 18 deletions(-)

diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 75b3ab2e..7cb64f93 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -12,6 +12,20 @@
 
 
 def main(argv: list[str] | None = None) -> int:
+    """Entry point for the ``sampleworks-runs`` console script.
+
+    Parameters
+    ----------
+    argv : list of str or None, optional
+        Command-line arguments excluding the program name. When ``None``
+        (the default), :mod:`argparse` reads from :data:`sys.argv`.
+
+    Returns
+    -------
+    int
+        Exit code suitable for ``sys.exit``: ``0`` on success, non-zero on
+        job failure or fatal CLI error.
+    """
     parser = _build_parser()
     args = parser.parse_args(argv)
 
@@ -36,6 +50,13 @@ def main(argv: list[str] | None = None) -> int:
 
 
 def _build_parser() -> argparse.ArgumentParser:
+    """Construct the :mod:`argparse` parser for ``sampleworks-runs``.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Parser covering preset selection, overrides, and execution flags.
+    """
     parser = argparse.ArgumentParser(
         prog="sampleworks-runs",
         description=(
@@ -78,6 +99,26 @@ def _build_parser() -> argparse.ArgumentParser:
 
 
 def _filter_only(preset: Preset, only: str) -> Preset:
+    """Return a new :class:`Preset` containing only the named jobs.
+
+    Parameters
+    ----------
+    preset : Preset
+        Source preset.
+    only : str
+        Comma-separated list of job names to keep.
+
+    Returns
+    -------
+    Preset
+        New preset with the same ``description``, ``defaults``, and
+        ``shared_args`` and only the filtered jobs.
+
+    Raises
+    ------
+    SystemExit
+        If any name in ``only`` does not match a job in ``preset``.
+    """
     names = [n.strip() for n in only.split(",") if n.strip()]
     keep = [j for j in preset.jobs if j.name in names]
     missing = set(names) - {j.name for j in keep}
@@ -93,6 +134,13 @@ def _filter_only(preset: Preset, only: str) -> Preset:
 
 
 def _print_show(preset: Preset) -> None:
+    """Print a human-readable rendering of a resolved preset to stdout.
+
+    Parameters
+    ----------
+    preset : Preset
+        Resolved preset to display (used by ``--show``).
+    """
     print(f"name: {preset.name}")
     if preset.description:
         print(f"description: {preset.description}")
@@ -112,6 +160,23 @@ def _print_show(preset: Preset) -> None:
 
 
 def _default_results_dir(preset: Preset) -> str:
+    """Pick a sensible default ``--results-dir`` when none is given.
+
+    Order of preference:
+      1. The preset's ``[defaults]`` ``RESULTS_DIR``.
+      2. The ``RESULTS_DIR`` environment variable.
+      3. ``./grid_search_results``.
+
+    Parameters
+    ----------
+    preset : Preset
+        Resolved preset (its ``defaults`` have already been merged with env).
+
+    Returns
+    -------
+    str
+        Path to use as the run's root output directory.
+    """
     return (
         preset.defaults.get("RESULTS_DIR")
         or os.environ.get("RESULTS_DIR")
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index 65be05ac..ce8b130c 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -21,16 +21,50 @@
 
 _BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets"
 _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
+_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
 
 
 def list_bundled_presets() -> list[str]:
-    """Return the names (sans ``.toml``) of bundled presets, sorted."""
+    """List the names of all TOML presets shipped with the package.
+
+    Returns
+    -------
+    list of str
+        Preset names (filename stems, no ``.toml`` extension), sorted
+        alphabetically.
+    """
     files = resources.files(_BUNDLED_PRESETS_PACKAGE)
     return sorted(p.name.removesuffix(".toml") for p in files.iterdir() if p.name.endswith(".toml"))
 
 
 def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
-    """Load a preset by bundled name or filesystem path, applying ``--set`` overrides."""
+    """Load a preset by bundled name or filesystem path.
+
+    Parameters
+    ----------
+    name_or_path : str
+        Either the name of a bundled preset (as returned by
+        :func:`list_bundled_presets`) or a path ending in ``.toml``.
+    overrides : Iterable of str, optional
+        ``KEY=VALUE`` strings as accepted by ``--set``. Applied before
+        variable interpolation.
+
+    Returns
+    -------
+    Preset
+        Fully resolved preset ready for :func:`runner.run`.
+
+    Raises
+    ------
+    FileNotFoundError
+        If ``name_or_path`` matches no bundled preset and no file on disk.
+    KeyError
+        If an override path begins with an unknown top-level key, or if a
+        ``${VAR}`` reference cannot be resolved against the environment or
+        the ``[defaults]`` block.
+    ValueError
+        If an override is malformed (missing ``=``).
+    """
     raw = _read_toml(name_or_path)
     overrides_list = list(overrides)
     raw = _apply_overrides(raw, overrides_list)
@@ -39,6 +73,23 @@ def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
 
 
 def _read_toml(name_or_path: str) -> dict[str, Any]:
+    """Read raw TOML from a filesystem path or a bundled package resource.
+
+    Parameters
+    ----------
+    name_or_path : str
+        Bundled preset name or filesystem path ending in ``.toml``.
+
+    Returns
+    -------
+    dict of str to Any
+        Parsed TOML, before override application or interpolation.
+
+    Raises
+    ------
+    FileNotFoundError
+        If neither location yields a TOML file.
+    """
     path = Path(name_or_path)
     if path.suffix == ".toml" and path.exists():
         return tomllib.loads(path.read_text())
@@ -52,10 +103,44 @@ def _read_toml(name_or_path: str) -> dict[str, Any]:
 
 
 def _preset_name(name_or_path: str) -> str:
+    """Return the canonical preset name for a bundled name or path argument.
+
+    Parameters
+    ----------
+    name_or_path : str
+        Either a bundled name or a path ending in ``.toml``.
+
+    Returns
+    -------
+    str
+        Filename stem if ``name_or_path`` looks like a path; otherwise the
+        argument unchanged.
+    """
     return Path(name_or_path).stem if name_or_path.endswith(".toml") else name_or_path
 
 
 def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any]:
+    """Apply each ``KEY=VALUE`` override to the raw preset dict in place.
+
+    Parameters
+    ----------
+    raw : dict of str to Any
+        Parsed TOML to mutate.
+    overrides : list of str
+        Each entry must contain exactly one ``=``.
+
+    Returns
+    -------
+    dict of str to Any
+        The same ``raw`` dict (mutated).
+
+    Raises
+    ------
+    ValueError
+        If an override is missing the ``=`` separator.
+    KeyError
+        If an override's top-level key is unknown.
+    """
     for spec in overrides:
         if "=" not in spec:
             raise ValueError(f"--set expects KEY=VALUE, got {spec!r}")
@@ -64,11 +149,29 @@ def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any
     return raw
 
 
-_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
-
-
 def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
-    """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``."""
+    """Set a nested value in ``obj`` addressed by a dotted path.
+
+    Job-list elements can be addressed by job name or by integer index.
+
+    Parameters
+    ----------
+    obj : dict of str to Any
+        Root dict to mutate.
+    dotted : str
+        Dotted path, e.g. ``"jobs.rf3.args.gradient-weights"`` or
+        ``"defaults.DATA_DIR"``.
+    value : Any
+        Coerced value to write at the leaf.
+
+    Raises
+    ------
+    KeyError
+        If the first segment is not one of :data:`_TOP_LEVEL_KEYS`, or if a
+        list segment references a missing job name or index.
+    TypeError
+        If the path attempts to descend through a non-container value.
+    """
     parts = dotted.split(".")
     if parts[0] not in _TOP_LEVEL_KEYS:
         raise KeyError(
@@ -87,6 +190,27 @@ def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
 
 
 def _index(cursor: Any, part: str, *, where: str) -> Any:
+    """Descend one level into a dict or list, auto-creating empty intermediates.
+
+    Parameters
+    ----------
+    cursor : Any
+        Current node in the traversal.
+    part : str
+        Next segment of the dotted path.
+    where : str
+        Path so far, used in error messages.
+
+    Returns
+    -------
+    Any
+        The child node.
+
+    Raises
+    ------
+    TypeError
+        If ``cursor`` is neither a dict nor a list.
+    """
     if isinstance(cursor, list):
         return cursor[_find_in_list(cursor, part, where=where)]
     if isinstance(cursor, dict):
@@ -97,6 +221,28 @@ def _index(cursor: Any, part: str, *, where: str) -> Any:
 
 
 def _find_in_list(items: list[Any], key: str, *, where: str) -> int:
+    """Locate a list element by integer index or by ``name`` field.
+
+    Parameters
+    ----------
+    items : list of Any
+        List to search.
+    key : str
+        Numeric string (positive or negative index) or a name to match against
+        each element's ``"name"`` key.
+    where : str
+        Path so far, used in error messages.
+
+    Returns
+    -------
+    int
+        Index of the matching element.
+
+    Raises
+    ------
+    KeyError
+        If no element with the given name exists.
+    """
     if key.isdigit() or (key.startswith("-") and key[1:].isdigit()):
         return int(key)
     for i, item in enumerate(items):
@@ -106,6 +252,19 @@ def _find_in_list(items: list[Any], key: str, *, where: str) -> int:
 
 
 def _coerce(value: str) -> Any:
+    """Convert a string CLI override value to bool, int, float, or leave as str.
+
+    Parameters
+    ----------
+    value : str
+        Right-hand side of ``KEY=VALUE``.
+
+    Returns
+    -------
+    Any
+        ``True``/``False`` for ``"true"``/``"false"`` (case-insensitive);
+        ``int`` or ``float`` if parseable; otherwise the original string.
+    """
     if value.lower() in ("true", "false"):
         return value.lower() == "true"
     try:
@@ -120,10 +279,27 @@ def _coerce(value: str) -> Any:
 
 
 def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]:
-    """Expand ``${VAR}`` in every string. Env wins; defaults block fills gaps.
-
-    Defaults are resolved in TOML order, so later defaults can reference earlier ones
-    (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``).
+    """Expand ``${VAR}`` references throughout the raw preset.
+
+    Defaults are resolved in TOML order, so later defaults can reference
+    earlier ones (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``). Process
+    environment variables take precedence over the ``[defaults]`` block.
+
+    Parameters
+    ----------
+    raw : dict of str to Any
+        Parsed TOML, after override application.
+
+    Returns
+    -------
+    dict of str to Any
+        New dict with all string values fully expanded and ``defaults``
+        replaced with the resolved values.
+
+    Raises
+    ------
+    KeyError
+        If any ``${VAR}`` cannot be resolved.
     """
     defaults: dict[str, str] = dict(raw.get("defaults", {}))
     accumulated: dict[str, str] = dict(os.environ)
@@ -140,6 +316,20 @@ def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]:
 
 
 def _walk(obj: Any, env: dict[str, str]) -> Any:
+    """Recursively expand ``${VAR}`` in every string within ``obj``.
+
+    Parameters
+    ----------
+    obj : Any
+        Arbitrary nested dict/list/scalar.
+    env : dict of str to str
+        Resolved variable map.
+
+    Returns
+    -------
+    Any
+        Structurally identical copy with strings expanded.
+    """
     if isinstance(obj, dict):
         return {k: _walk(v, env) for k, v in obj.items()}
     if isinstance(obj, list):
@@ -150,6 +340,26 @@ def _walk(obj: Any, env: dict[str, str]) -> Any:
 
 
 def _expand(text: str, env: dict[str, str]) -> str:
+    """Substitute ``${VAR}`` references in ``text`` until a fixed point.
+
+    Parameters
+    ----------
+    text : str
+        String potentially containing ``${VAR}`` references.
+    env : dict of str to str
+        Variable map.
+
+    Returns
+    -------
+    str
+        Fully expanded string.
+
+    Raises
+    ------
+    KeyError
+        If a referenced variable is not in ``env``.
+    """
+
     def repl(match: re.Match[str]) -> str:
         var = match.group(1)
         if var not in env:
@@ -165,6 +375,26 @@ def repl(match: re.Match[str]) -> str:
 
 
 def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
+    """Construct a :class:`Preset` from a resolved raw dict.
+
+    Parameters
+    ----------
+    name : str
+        Preset name (assigned to :attr:`Preset.name`).
+    raw : dict of str to Any
+        Resolved TOML.
+
+    Returns
+    -------
+    Preset
+        Validated preset.
+
+    Raises
+    ------
+    ValueError
+        If ``raw['jobs']`` is not a list, or if any :class:`Job` /
+        :class:`Preset` invariant fails (see their docstrings).
+    """
     raw_jobs = raw.get("jobs", [])
     if not isinstance(raw_jobs, list):
         raise ValueError(f"Preset {name!r}: 'jobs' must be a list")
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 4e273afa..aaf32f30 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -18,8 +18,22 @@
 GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
 
 
-@dataclass
+@dataclass(frozen=True)
 class JobInvocation:
+    """The fully resolved command to launch for one job.
+
+    Parameters
+    ----------
+    job : Job
+        Originating :class:`Job` (kept for introspection in logs).
+    argv : list of str
+        Subprocess command line (starts with ``pixi run -e <env> python ...``).
+    env : dict of str to str
+        Process environment, including ``CUDA_VISIBLE_DEVICES``.
+    log_path : Path
+        File to tee stdout+stderr into.
+    """
+
     job: Job
     argv: list[str]
     env: dict[str, str]
@@ -27,7 +41,24 @@ class JobInvocation:
 
 
 def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocation]:
-    """Build the subprocess argv + env + log path for every job in the preset."""
+    """Build the subprocess invocation for every job in the preset.
+
+    Per-job ``args`` are merged on top of :attr:`Preset.shared_args`, with
+    ``--output-dir`` auto-injected from ``results_dir / job.output_subdir`` if
+    not already present.
+
+    Parameters
+    ----------
+    preset : Preset
+        Resolved preset to launch.
+    results_dir : Path
+        Root directory for outputs and per-job log files.
+
+    Returns
+    -------
+    list of JobInvocation
+        One :class:`JobInvocation` per job, in declaration order.
+    """
     invocations: list[JobInvocation] = []
     for job in preset.jobs:
         args = preset.effective_args(job)
@@ -40,6 +71,23 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
 
 
 def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
+    """Assemble the ``pixi run`` argv list for one job's args dict.
+
+    ``True`` bools become bare flags, ``False``/``None`` are dropped, all other
+    values are stringified.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Pixi environment name passed to ``-e``.
+    args : dict of str to Any
+        Flag-name to value map (kebab-case keys, no leading ``--``).
+
+    Returns
+    -------
+    list of str
+        Subprocess argv.
+    """
     argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT]
     for key, value in args.items():
         flag = f"--{key}"
@@ -54,7 +102,26 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
 
 
 def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
-    """Launch every job in parallel; tee output to per-job logs; return 0 iff all succeed."""
+    """Launch every job in parallel and wait for completion.
+
+    Stdout+stderr from each job is teed to a per-job log file under
+    ``results_dir`` and also echoed to the driver's stderr with a ``[job_name]``
+    prefix.
+
+    Parameters
+    ----------
+    preset : Preset
+        Preset to launch.
+    results_dir : Path
+        Root directory for outputs and logs. Created if missing.
+    dry_run : bool, optional
+        If True, print the resolved commands instead of launching anything.
+
+    Returns
+    -------
+    int
+        ``0`` if all jobs exited 0 (or ``dry_run`` was set), ``1`` otherwise.
+    """
     results_dir.mkdir(parents=True, exist_ok=True)
     invocations = build_invocations(preset, results_dir=results_dir)
 
@@ -75,7 +142,14 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
 
 
 def _terminate_all(jobs: list[_RunningJob]) -> None:
-    """Terminate any already-launched jobs (used when a later spawn fails)."""
+    """Terminate any already-launched jobs (used when a later spawn fails).
+
+    Parameters
+    ----------
+    jobs : list of _RunningJob
+        Jobs whose subprocesses should be SIGTERM'd, waited on, and whose tee
+        threads should be joined.
+    """
     for j in jobs:
         if j.proc.poll() is None:
             j.proc.terminate()
@@ -85,6 +159,13 @@ def _terminate_all(jobs: list[_RunningJob]) -> None:
 
 
 def _print_dry_run(inv: JobInvocation) -> None:
+    """Print the exact command for one job without launching it.
+
+    Parameters
+    ----------
+    inv : JobInvocation
+        Invocation to print.
+    """
     print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
     print(f"# log: {inv.log_path}", file=sys.stderr)
     print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}")
@@ -92,6 +173,15 @@ def _print_dry_run(inv: JobInvocation) -> None:
 
 
 def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> None:
+    """Print a banner describing what is about to be launched.
+
+    Parameters
+    ----------
+    preset : Preset
+        Preset being launched.
+    invocations : list of JobInvocation
+        Jobs about to be spawned.
+    """
     bar = "=" * 60
     print(bar, file=sys.stderr)
     print(f"preset: {preset.name}", file=sys.stderr)
@@ -105,14 +195,44 @@ def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> N
     print(bar, file=sys.stderr)
 
 
-@dataclass
+@dataclass(frozen=True)
 class _RunningJob:
+    """Internal handle: a spawned subprocess and its log-tee thread.
+
+    Parameters
+    ----------
+    inv : JobInvocation
+        Originating invocation.
+    proc : subprocess.Popen
+        The subprocess (PIPE'd stdout merged with stderr).
+    tee_thread : threading.Thread
+        Daemon thread copying ``proc.stdout`` to the log file and to
+        ``sys.stderr`` with a per-job prefix.
+    """
+
     inv: JobInvocation
     proc: subprocess.Popen[bytes]
     tee_thread: threading.Thread
 
 
 def _spawn(inv: JobInvocation) -> _RunningJob:
+    """Start one subprocess and a thread to tee its output.
+
+    Parameters
+    ----------
+    inv : JobInvocation
+        Invocation to spawn.
+
+    Returns
+    -------
+    _RunningJob
+        Handle covering the subprocess and the tee thread.
+
+    Raises
+    ------
+    OSError
+        Propagated if the subprocess fails to start (e.g. binary missing).
+    """
     inv.log_path.parent.mkdir(parents=True, exist_ok=True)
     log_file = open(inv.log_path, "wb")
     try:
@@ -138,6 +258,18 @@ def _spawn(inv: JobInvocation) -> _RunningJob:
 
 
 def _wait_all(jobs: list[_RunningJob]) -> int:
+    """Wait for every job to exit and aggregate their exit codes.
+
+    Parameters
+    ----------
+    jobs : list of _RunningJob
+        Jobs to wait on.
+
+    Returns
+    -------
+    int
+        ``0`` if all jobs exited 0, ``1`` if any failed.
+    """
     failures = 0
     for j in jobs:
         exit_code = j.proc.wait()
@@ -151,6 +283,18 @@ def _wait_all(jobs: list[_RunningJob]) -> int:
 
 
 def _tee(prefix: str, src: Any, dest: Any) -> None:
+    """Copy bytes from ``src`` to ``dest`` and to stderr with a label.
+
+    Parameters
+    ----------
+    prefix : str
+        Per-line label prepended to the stderr echo (e.g. job name).
+    src : file-like
+        Readable byte stream (typically ``Popen.stdout`` with stderr merged).
+    dest : file-like
+        Writable byte stream for the on-disk log file. Closed when ``src`` is
+        exhausted.
+    """
     for line in iter(src.readline, b""):
         dest.write(line)
         dest.flush()
@@ -160,8 +304,21 @@ def _tee(prefix: str, src: Any, dest: Any) -> None:
 
 
 def _ts() -> str:
+    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
     return time.strftime("%Y-%m-%d %H:%M:%S")
 
 
 def _shell_join(argv: list[str]) -> str:
+    """Quote ``argv`` so the result can be pasted into a POSIX shell.
+
+    Parameters
+    ----------
+    argv : list of str
+        Argument vector.
+
+    Returns
+    -------
+    str
+        Single shell-quoted command line.
+    """
     return shlex.join(argv)
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index 2f50a3e9..6e1b768a 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -14,8 +14,37 @@
 VALID_PIXI_ENVS = ("boltz", "protenix", "rf3")
 
 
-@dataclass
+@dataclass(frozen=True)
 class Job:
+    """One parallel `run_grid_search.py` invocation within a preset.
+
+    Parameters
+    ----------
+    name : str
+        Identifier used for per-job log files and ``--only`` selection. Must be
+        unique within the parent :class:`Preset`.
+    env : str
+        Pixi environment to run the job in. Must be one of
+        :data:`VALID_PIXI_ENVS`.
+    gpus : str
+        Value to set as ``CUDA_VISIBLE_DEVICES`` for the subprocess (e.g.
+        ``"4"`` or ``"0,1"``).
+    output_subdir : str
+        Path appended to the run's ``results_dir`` to form the job's
+        ``--output-dir`` argument, when one is not given explicitly in ``args``.
+    args : dict of str to Any, optional
+        Per-job overrides merged on top of the preset's
+        :attr:`Preset.shared_args`. Keys are CLI flag names (without the
+        leading ``--``); bools become bare flags (``True``) or omitted
+        (``False``).
+
+    Raises
+    ------
+    ValueError
+        If ``env`` is not in :data:`VALID_PIXI_ENVS`, or if ``gpus`` /
+        ``output_subdir`` is empty.
+    """
+
     name: str
     env: str
     gpus: str
@@ -23,6 +52,7 @@ class Job:
     args: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
+        """Validate ``env`` and required string fields."""
         if self.env not in VALID_PIXI_ENVS:
             raise ValueError(
                 f"Job {self.name!r}: env must be one of {VALID_PIXI_ENVS}, got {self.env!r}"
@@ -33,8 +63,32 @@ def __post_init__(self) -> None:
             raise ValueError(f"Job {self.name!r}: output_subdir must be non-empty")
 
 
-@dataclass
+@dataclass(frozen=True)
 class Preset:
+    """A named bundle of parallel jobs orchestrated as a unit.
+
+    Parameters
+    ----------
+    name : str
+        Identifier (matches the bundled TOML filename without the ``.toml``
+        suffix, or the stem of a user-supplied path).
+    description : str
+        Human-readable summary shown by ``--list`` and the launch banner.
+    defaults : dict of str to str, optional
+        Default values for ``${VAR}`` interpolation. The process environment
+        takes precedence; this block only fills in unset keys.
+    shared_args : dict of str to Any, optional
+        Args merged into every job's ``args`` before argv is built. Per-job
+        ``args`` win on collision.
+    jobs : list of Job
+        Jobs to launch in parallel. Must be non-empty and have unique names.
+
+    Raises
+    ------
+    ValueError
+        If ``jobs`` is empty or contains duplicate names.
+    """
+
     name: str
     description: str
     defaults: dict[str, str] = field(default_factory=dict)
@@ -42,6 +96,7 @@ class Preset:
     jobs: list[Job] = field(default_factory=list)
 
     def __post_init__(self) -> None:
+        """Validate the job list is non-empty and names are unique."""
         if not self.jobs:
             raise ValueError(f"Preset {self.name!r}: must declare at least one job")
         seen: set[str] = set()
@@ -51,11 +106,39 @@ def __post_init__(self) -> None:
             seen.add(job.name)
 
     def job(self, name: str) -> Job:
+        """Return the :class:`Job` with the given name.
+
+        Parameters
+        ----------
+        name : str
+            Job name to look up.
+
+        Returns
+        -------
+        Job
+            The matching job.
+
+        Raises
+        ------
+        KeyError
+            If no job has the given name.
+        """
         for j in self.jobs:
             if j.name == name:
                 return j
         raise KeyError(f"Preset {self.name!r} has no job {name!r}")
 
     def effective_args(self, job: Job) -> dict[str, Any]:
-        """Return ``shared_args`` merged with per-job overrides."""
+        """Merge :attr:`shared_args` with a job's per-job overrides.
+
+        Parameters
+        ----------
+        job : Job
+            Job whose ``args`` override the shared defaults.
+
+        Returns
+        -------
+        dict of str to Any
+            New dict; mutating it does not affect the preset.
+        """
         return {**self.shared_args, **job.args}

From ce82079f6f4cee558f2511b846e51b725d3400db Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Mon, 18 May 2026 23:14:40 -0700
Subject: [PATCH 06/28] fix(runs): point presets at /mnt/diffuse-shared paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Match the actual mounted layout on the ACTL pod:
  /mnt/diffuse-shared/raw/sampleworks/
    ├── initial_dataset_40/                 # DATA_DIR for all_models
    ├── initial_dataset_40_occ_sweeps/      # DATA_DIR for occ-sweep presets
    ├── actl_msa_cache/                     # MSA_CACHE_DIR for all presets
    └── actl_results/<preset_name>/         # RESULTS_DIR namespaced per preset

Per-preset RESULTS_DIR avoids cross-preset collisions when multiple
presets share output_subdirs (e.g. all_models and rf3_partial both have
a job named "rf3").

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sampleworks/runs/presets/all_models.toml             | 6 +++---
 src/sampleworks/runs/presets/protenix_dual.toml          | 6 +++---
 src/sampleworks/runs/presets/rf3_partial.toml            | 6 +++---
 src/sampleworks/runs/presets/rf3_partial_chiral_off.toml | 6 +++---
 src/sampleworks/runs/presets/rf3_protenix.toml           | 6 +++---
 tests/runs/test_loader.py                                | 2 +-
 tests/runs/test_runner.py                                | 2 +-
 7 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml
index 21d1cd92..4ed1bcf7 100644
--- a/src/sampleworks/runs/presets/all_models.toml
+++ b/src/sampleworks/runs/presets/all_models.toml
@@ -1,9 +1,9 @@
 description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)."
 
 [defaults]
-DATA_DIR = "/data/input"
-RESULTS_DIR = "/data/results"
-MSA_CACHE_DIR = "${HOME}/.sampleworks/msa"
+DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40"
+RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/all_models"
+MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 
 [shared_args]
diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml
index 461d547e..1f4c0e36 100644
--- a/src/sampleworks/runs/presets/protenix_dual.toml
+++ b/src/sampleworks/runs/presets/protenix_dual.toml
@@ -1,9 +1,9 @@
 description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results"
-MSA_CACHE_DIR = "/data/sampleworks-exp/msa_cache"
+DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/protenix_dual"
+MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt"
 PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt"
diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml
index 533accfb..911552e1 100644
--- a/src/sampleworks/runs/presets/rf3_partial.toml
+++ b/src/sampleworks/runs/presets/rf3_partial.toml
@@ -1,9 +1,9 @@
 description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results"
-MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial"
+MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
 
diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
index bd3a1311..2f1e1817 100644
--- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
+++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
@@ -1,9 +1,9 @@
 description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results_rf3_chiral_off"
-MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial_chiral_off"
+MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
 
diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml
index 9255cfea..32d2eb23 100644
--- a/src/sampleworks/runs/presets/rf3_protenix.toml
+++ b/src/sampleworks/runs/presets/rf3_protenix.toml
@@ -1,9 +1,9 @@
 description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results"
-MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache"
+DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_protenix"
+MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 
 [shared_args]
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index d8d6509c..13379a41 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -40,7 +40,7 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial")
-    expected = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
+    expected = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
     assert preset.defaults["DATA_DIR"] == expected
 
 
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 050d7458..635a8c28 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -26,7 +26,7 @@ def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> N
     assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"]
     pairs = _argv_to_dict(argv[6:])
     assert pairs["--proteins"] == (
-        "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv"
+        "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv"
     )
     assert pairs["--model"] == "rf3"
     assert pairs["--scalers"] == "pure_guidance"

From 09c367bcccb284164f1980190a97a141cceff055 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Tue, 19 May 2026 11:35:08 -0700
Subject: [PATCH 07/28] fix(runs): align preset paths with
 actl_setup_sampleworks_paths.sh

The proteins.csv files in the shared dataset reference /data/inputs/
paths that only resolve after the pod-init script
(actl_setup_sampleworks_paths.sh) creates the canonical symlinks. Point
presets at those canonical paths so everything stays consistent:

  DATA_DIR          /data/input (all_models) | /data/inputs (all other presets)
  RESULTS_DIR       /data/results/<preset_name>
  MSA_CACHE_DIR     /root/.sampleworks

/data/results is namespaced by the pod-init script via
$SAMPLEWORKS_ACTL_RUN_NAME (defaults to $HOSTNAME), so per-preset
subdirs sit inside a per-session root.

README documents the one-time setup script step.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                                                | 8 ++++++++
 src/sampleworks/runs/presets/all_models.toml             | 6 +++---
 src/sampleworks/runs/presets/protenix_dual.toml          | 6 +++---
 src/sampleworks/runs/presets/rf3_partial.toml            | 6 +++---
 src/sampleworks/runs/presets/rf3_partial_chiral_off.toml | 6 +++---
 src/sampleworks/runs/presets/rf3_protenix.toml           | 6 +++---
 tests/runs/test_loader.py                                | 3 +--
 tests/runs/test_runner.py                                | 4 +---
 8 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index efdcc537..bfa49541 100644
--- a/README.md
+++ b/README.md
@@ -156,6 +156,14 @@ Instructions for running evaluation and metrics scripts are coming soon.
 
 For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes.
 
+**Pod-side prerequisite.** Bundled presets reference the canonical `/data/input` (used by `all_models`), `/data/inputs`, `/data/results`, and `/root/.sampleworks` paths set up by the ACTL pod-init script. On a fresh sampleworks pod, run once per session:
+
+```bash
+bash /mnt/diffuse-shared/raw/sampleworks/actl_setup_sampleworks_paths.sh
+```
+
+That creates symlinks pointing the canonical paths at the shared mount (and namespaces `/data/results` by hostname or `$SAMPLEWORKS_ACTL_RUN_NAME`). Overrides via env var (`DATA_DIR=...`) or CLI (`--set defaults.DATA_DIR=...`) work without the symlinks.
+
 ```bash
 pixi run -e rf3 sampleworks-runs --list                          # bundled presets
 pixi run -e rf3 sampleworks-runs rf3_partial                     # run a preset
diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml
index 4ed1bcf7..72701997 100644
--- a/src/sampleworks/runs/presets/all_models.toml
+++ b/src/sampleworks/runs/presets/all_models.toml
@@ -1,9 +1,9 @@
 description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40"
-RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/all_models"
-MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
+DATA_DIR = "/data/input"
+RESULTS_DIR = "/data/results/all_models"
+MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 
 [shared_args]
diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml
index 1f4c0e36..231ee220 100644
--- a/src/sampleworks/runs/presets/protenix_dual.toml
+++ b/src/sampleworks/runs/presets/protenix_dual.toml
@@ -1,9 +1,9 @@
 description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/protenix_dual"
-MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/protenix_dual"
+MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt"
 PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt"
diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml
index 911552e1..60a063e1 100644
--- a/src/sampleworks/runs/presets/rf3_partial.toml
+++ b/src/sampleworks/runs/presets/rf3_partial.toml
@@ -1,9 +1,9 @@
 description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial"
-MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/rf3_partial"
+MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
 
diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
index 2f1e1817..af0e5ac8 100644
--- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
+++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
@@ -1,9 +1,9 @@
 description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial_chiral_off"
-MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/rf3_partial_chiral_off"
+MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
 
diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml
index 32d2eb23..4ca5638d 100644
--- a/src/sampleworks/runs/presets/rf3_protenix.toml
+++ b/src/sampleworks/runs/presets/rf3_protenix.toml
@@ -1,9 +1,9 @@
 description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)."
 
 [defaults]
-DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_protenix"
-MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache"
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/rf3_protenix"
+MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 
 [shared_args]
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 13379a41..6528a602 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -40,8 +40,7 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial")
-    expected = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-    assert preset.defaults["DATA_DIR"] == expected
+    assert preset.defaults["DATA_DIR"] == "/data/inputs"
 
 
 def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 635a8c28..39aefcb2 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -25,9 +25,7 @@ def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> N
     argv = inv.argv
     assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"]
     pairs = _argv_to_dict(argv[6:])
-    assert pairs["--proteins"] == (
-        "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv"
-    )
+    assert pairs["--proteins"] == "/data/inputs/proteins.csv"
     assert pairs["--model"] == "rf3"
     assert pairs["--scalers"] == "pure_guidance"
     assert pairs["--partial-diffusion-step"] == "120"

From ae77784ceeb10c1408b1d051ba0b24175ae09f85 Mon Sep 17 00:00:00 2001
From: Magomed Abdurakhmanov <magomed@astera.org>
Date: Tue, 19 May 2026 11:39:06 -0700
Subject: [PATCH 08/28] fix(runs): mkdir the per-job --output-dir before
 launching

run_grid_search.py writes its work-queue pickle (wjq_*.pkl) directly
into --output-dir without creating it first, so the orchestrator must
create that directory itself. Previously only results_dir was mkdir'd,
which broke any preset whose output_subdir didn't already exist on disk.

JobInvocation now carries the resolved output_dir explicitly so _spawn
can mkdir it alongside the log directory. The mkdir happens only in
_spawn, so --dry-run still creates no per-job output directories.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sampleworks/runs/runner.py | 10 +++++++++-
 tests/runs/test_runner.py      | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index aaf32f30..6a5c6fbb 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -32,12 +32,16 @@ class JobInvocation:
         Process environment, including ``CUDA_VISIBLE_DEVICES``.
     log_path : Path
         File to tee stdout+stderr into.
+    output_dir : Path
+        Resolved ``--output-dir`` value (mkdir'd by the runner before launch
+        because ``run_grid_search.py`` assumes its existence).
     """
 
     job: Job
     argv: list[str]
     env: dict[str, str]
     log_path: Path
+    output_dir: Path
 
 
 def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocation]:
@@ -66,7 +70,10 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
         argv = _build_argv(job.env, args)
         env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus}
         log_path = results_dir / f"{job.name}_run.log"
-        invocations.append(JobInvocation(job=job, argv=argv, env=env, log_path=log_path))
+        output_dir = Path(args["output-dir"])
+        invocations.append(
+            JobInvocation(job=job, argv=argv, env=env, log_path=log_path, output_dir=output_dir)
+        )
     return invocations
 
 
@@ -234,6 +241,7 @@ def _spawn(inv: JobInvocation) -> _RunningJob:
         Propagated if the subprocess fails to start (e.g. binary missing).
     """
     inv.log_path.parent.mkdir(parents=True, exist_ok=True)
+    inv.output_dir.mkdir(parents=True, exist_ok=True)
     log_file = open(inv.log_path, "wb")
     try:
         proc = subprocess.Popen(
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 39aefcb2..61c5b758 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -92,6 +92,27 @@ def test_rf3_partial_chiral_off_flag_present(monkeypatch: pytest.MonkeyPatch) ->
     assert "--force-all" in inv.argv
 
 
+def test_build_invocations_records_output_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+    """`run_grid_search.py` assumes its --output-dir exists; the runner must mkdir it."""
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("rf3_partial")
+    inv = runner.build_invocations(preset, results_dir=Path("/r"))[0]
+    assert inv.output_dir == Path("/r/rf3")
+
+
+def test_dry_run_does_not_create_directories(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """--dry-run prints commands but never touches the filesystem."""
+    monkeypatch.setenv("HOME", str(tmp_path))
+    results_dir = tmp_path / "results"
+    preset = loader.load_preset("rf3_partial")
+    runner.run(preset, results_dir=results_dir, dry_run=True)
+    # results_dir gets created by run() (for log file location) but per-job
+    # output subdirs must NOT exist after dry-run.
+    assert not (results_dir / "rf3").exists()
+
+
 def _argv_to_dict(tail: list[str]) -> dict[str, object]:
     """Turn ``[--a, 1, --b, --c, 2]`` into ``{'--a': '1', '--b': True, '--c': '2'}``."""
     out: dict[str, object] = {}

From 2da1e2dffb9ab646a8189b189aed03b9693222b0 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Thu, 21 May 2026 12:15:01 -0400
Subject: [PATCH 09/28] fix(runs): make ACTL sampleworks image self-contained

---
 .actlignore                                   |  13 +
 Dockerfile                                    |  14 +-
 README.md                                     |  47 ++-
 run_all_models.sh                             | 307 ++++++++----------
 run_grid_search.py                            |  78 ++++-
 src/sampleworks/models/boltz/wrapper.py       |  15 +-
 src/sampleworks/runs/presets/all_models.toml  |   2 +-
 src/sampleworks/runs/runner.py                | 147 ++++++++-
 src/sampleworks/runs/schema.py                |   2 +-
 .../utils/guidance_script_arguments.py        |  35 +-
 .../utils/guidance_script_utils.py            |   8 +-
 tests/runs/conftest.py                        |  11 +
 tests/runs/test_loader.py                     |   9 +
 tests/runs/test_runner.py                     |  34 ++
 14 files changed, 517 insertions(+), 205 deletions(-)
 create mode 100644 .actlignore
 create mode 100644 tests/runs/conftest.py

diff --git a/.actlignore b/.actlignore
new file mode 100644
index 00000000..047b4971
--- /dev/null
+++ b/.actlignore
@@ -0,0 +1,13 @@
+# Keep ACTL sync focused on source. Large data/results should live under
+# /mnt/diffuse-shared or the pod home PVC, not in the synced checkout.
+.pixi/
+grid_search_results/
+outputs/
+data/
+initial_dataset_40*/
+checkpoints/
+release_data/
+*.ckpt
+*.pt
+*.tar.gz
+*.tgz
diff --git a/Dockerfile b/Dockerfile
index a51f8651..c7ed94d7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,8 +7,8 @@
 # Build:
 #   docker build -t sampleworks .
 #
-# CI builds pull checkpoints automatically from Docker Hub via:
-#   COPY --from=diffuseproject/sampleworks-checkpoints:latest
+# CI builds pull checkpoints automatically from Harbor via:
+#   COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest
 # No checkpoint files are needed in the build context or on the CI runner.
 #
 # To rebuild the checkpoints base image (only needed when checkpoints change):
@@ -56,7 +56,7 @@
 #   /checkpoints/protenix_base_default_v0.5.0.pt     - Protenix model (~1.4GB)
 #
 # Checkpoints base image:
-#   All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub.
+#   All checkpoints live in harbor.astera.sh/library/sampleworks-checkpoints:latest.
 #   To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server.
 
 # ============================================================================
@@ -108,7 +108,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh
 # ============================================================================
 # Checkpoints (~10 GB) rarely change, so this layer is placed before pixi
 # installs to stay cached even when dependencies update.
-COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/
+COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/
 
 # ============================================================================
 # Install all three environments: boltz, protenix, rf3
@@ -129,6 +129,12 @@ RUN pixi run -e boltz python -c "\
 from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \
 print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)"
 
+COPY run_all_models.sh ./
+RUN chmod +x /app/run_all_models.sh \
+    && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \
+    && chmod +x /usr/local/bin/run_all_models.sh \
+    && printf '\n# ACTL scientist workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n    cd /app\nfi\n' >> /root/.bashrc
+
 # Set default checkpoint paths via environment variables
 ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \
     BOLTZ2_CHECKPOINT=/checkpoints/boltz2_conf.ckpt \
diff --git a/README.md b/README.md
index bfa49541..dade48e7 100644
--- a/README.md
+++ b/README.md
@@ -152,35 +152,50 @@ Output layout: `grid_search_results/<protein>/<model>[_<method>]/<scaler>/ens<N>
 Instructions for running evaluation and metrics scripts are coming soon.
 
 
-## Preset experiments (`sampleworks-runs`)
+## ACTL preset experiments (`run_all_models.sh` / `sampleworks-runs`)
 
-For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes.
+For canonical multi-model/multi-GPU sweeps, `sampleworks-runs` orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares the model, pixi env, GPU assignment, output subdir, and CLI args. The runner launches jobs in parallel, tees per-job logs, and aggregates exit codes.
 
-**Pod-side prerequisite.** Bundled presets reference the canonical `/data/inputs`, `/data/results`, and `/root/.sampleworks` paths set up by the ACTL pod-init script. On a fresh sampleworks pod, run once per session:
+On ACTL, start the pod with the prebuilt image and shared storage, then run one command inside the pod shell:
 
 ```bash
-bash /mnt/diffuse-shared/raw/sampleworks/actl_setup_sampleworks_paths.sh
+actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+
+# inside the ACTL pod shell
+# the sampleworks image drops interactive shells in /app
+run_all_models.sh --dry-run   # inspect commands first
+run_all_models.sh             # run /app/src/sampleworks/runs/presets/all_models.toml
 ```
 
-That creates symlinks pointing the canonical paths at the shared mount (and namespaces `/data/results` by hostname or `$SAMPLEWORKS_ACTL_RUN_NAME`). Overrides via env var (`DATA_DIR=...`) or CLI (`--set defaults.DATA_DIR=...`) work without the symlinks.
+The wrapper keeps the TOML preset as the source of truth. It only supplies ACTL-friendly defaults:
+
+- `DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps`
+- `RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks/<pod>/all_models`
+- `MSA_CACHE_DIR=/mnt/diffuse-shared/cache/sampleworks/msa`
+- `PYTHONPATH=/app/src`, using the copy baked into the sampleworks image
+- direct `/app/.pixi/envs/<env>/bin/python` execution, so it reuses the environments baked into the sampleworks image without refreshing pixi caches
+- `/tmp` pixi/uv caches for any missing environment preparation, avoiding shared-storage Git cache issues
+
+Common commands:
 
 ```bash
-pixi run -e rf3 sampleworks-runs --list                          # bundled presets
-pixi run -e rf3 sampleworks-runs rf3_partial                     # run a preset
-pixi run -e rf3 sampleworks-runs rf3_partial --show              # inspect resolved values
-pixi run -e rf3 sampleworks-runs rf3_partial --dry-run           # print pixi run commands, don't execute
-pixi run -e rf3 sampleworks-runs all_models --only rf3,protenix  # subset jobs
-
-# Override any value without editing the TOML:
-pixi run -e rf3 sampleworks-runs rf3_partial \
-    --set jobs.rf3.gpus=7 \
+run_all_models.sh --list                         # bundled presets
+run_all_models.sh all_models --show              # inspect resolved values
+run_all_models.sh all_models --only rf3,protenix # subset jobs
+run_all_models.sh rf3_partial                    # run a smaller preset
+
+# Override paths or parameters without editing TOML:
+DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/my_dataset run_all_models.sh rf3_partial
+run_all_models.sh rf3_partial \
+    --set jobs.rf3.gpus=0 \
     --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
 ```
 
-Bundled presets live in `src/sampleworks/runs/presets/*.toml`. Add a new preset by dropping a `.toml` file alongside them or pointing at any path:
+Bundled presets live in `src/sampleworks/runs/presets/*.toml`. You can also copy one, edit it, and run it by path:
 
 ```bash
-sampleworks-runs ./my_experiment.toml
+cp src/sampleworks/runs/presets/all_models.toml my_experiment.toml
+run_all_models.sh ./my_experiment.toml
 ```
 
 Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block.
diff --git a/run_all_models.sh b/run_all_models.sh
index 5b90f81f..96b032b0 100755
--- a/run_all_models.sh
+++ b/run_all_models.sh
@@ -1,164 +1,145 @@
-#!/bin/bash
-# Run all 4 model grid searches in parallel, 2 GPUs each
-# Total: 8 GPUs used (4 jobs x 2 GPUs each)
+#!/usr/bin/env bash
+# ACTL-native entry point for Sampleworks preset runs.
 #
-# Models:
-#   - Boltz2 X-ray diffraction (GPUs 0,1)
-#   - Boltz2 MD               (GPUs 2,3)
-#   - RosettaFold3             (GPUs 4,5)
-#   - Protenix                 (GPUs 6,7)
-#
-# Checkpoints are BAKED INTO the Docker image at /checkpoints/.
-# If missing, the code auto-falls back to mounted paths.
-#
-# Usage:
-#   ./run_all_models.sh
-
-set -e
-
-# Configuration
-DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
-RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}"
-MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}"
-
-# Create directories
-mkdir -p "$RESULTS_DIR"
-mkdir -p "$MSA_CACHE_DIR"
-
-# Pull latest image (no-op if already up to date)
-echo "Pulling latest Docker image..."
-docker pull diffuseproject/sampleworks:latest
-
-# Common docker options
-DOCKER_OPTS="--rm --shm-size=16g"
-
-echo "=========================================="
-echo "Starting all model grid searches (4 jobs x 2 GPUs)"
-echo "Data: $DATA_DIR"
-echo "Results: $RESULTS_DIR"
-echo "MSA Cache: $MSA_CACHE_DIR"
-echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)"
-echo ""
-echo "Models:"
-echo "  - Boltz2 X-ray (GPUs 0,1)"
-echo "  - Boltz2 MD    (GPUs 2,3)"
-echo "  - RF3          (GPUs 4,5)"
-echo "  - Protenix     (GPUs 6,7)"
-echo "=========================================="
-
-PIDS=()
-
-# --- Boltz2 X-ray Diffraction (GPUs 0,1) ---
-echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1"
-docker run $DOCKER_OPTS \
-    --gpus '"device=0,1"' \
-    -v "$DATA_DIR:/data/inputs:ro" \
-    -v "$RESULTS_DIR:/data/results" \
-    -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
-    -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \
-    -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \
-    diffuseproject/sampleworks:latest \
-    -e boltz run_grid_search.py \
-    --proteins "/data/inputs/proteins.csv" \
-    --model boltz2 \
-    --method "X-RAY DIFFRACTION" \
-    --scalers pure_guidance \
-    --partial-diffusion-step 120 \
-    --ensemble-sizes "8" \
-    --gradient-weights "0.1 0.2 0.5" \
-    --gradient-normalization --augmentation --align-to-input \
-    --output-dir /data/results \
-    2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" &
-PIDS+=($!)
-echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})"
-
-# --- Boltz2 MD (GPUs 2,3) ---
-echo "[$(date)] Starting Boltz2 MD on GPUs 2,3"
-docker run $DOCKER_OPTS \
-    --gpus '"device=2,3"' \
-    -v "$DATA_DIR:/data/inputs:ro" \
-    -v "$RESULTS_DIR:/data/results" \
-    -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
-    -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \
-    -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \
-    diffuseproject/sampleworks:latest \
-    -e boltz run_grid_search.py \
-    --proteins "/data/inputs/proteins.csv" \
-    --model boltz2 \
-    --method "MD" \
-    --scalers pure_guidance \
-    --partial-diffusion-step 120 \
-    --ensemble-sizes "8" \
-    --gradient-weights "0.1 0.2 0.5" \
-    --gradient-normalization --augmentation --align-to-input \
-    --output-dir /data/results \
-    2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" &
-PIDS+=($!)
-echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})"
-
-# --- RosettaFold3 (GPUs 4,5) ---
-echo "[$(date)] Starting RosettaFold3 on GPUs 4,5"
-docker run $DOCKER_OPTS \
-    --gpus '"device=4,5"' \
-    -v "$DATA_DIR:/data/inputs:ro" \
-    -v "$RESULTS_DIR:/data/results" \
-    -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
-    -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \
-    -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \
-    diffuseproject/sampleworks:latest \
-    -e rf3 run_grid_search.py \
-    --proteins "/data/inputs/proteins.csv" \
-    --model rf3 \
-    --partial-diffusion-step 120 \
-    --scalers pure_guidance \
-    --ensemble-sizes "8" \
-    --gradient-weights "0.01 0.02 0.05" \
-    --gradient-normalization --augmentation --align-to-input \
-    --output-dir /data/results \
-    2>&1 | tee "$RESULTS_DIR/rf3_run.log" &
-PIDS+=($!)
-echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})"
-
-# --- Protenix (GPUs 6,7) ---
-echo "[$(date)] Starting Protenix on GPUs 6,7"
-docker run $DOCKER_OPTS \
-    --gpus '"device=6,7"' \
-    -v "$DATA_DIR:/data/inputs:ro" \
-    -v "$RESULTS_DIR:/data/results" \
-    -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
-    -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \
-    -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \
-    diffuseproject/sampleworks:latest \
-    -e protenix run_grid_search.py \
-    --proteins "/data/inputs/proteins.csv" \
-    --model protenix \
-    --scalers pure_guidance \
-    --partial-diffusion-step 120 \
-    --ensemble-sizes "8" \
-    --gradient-weights "0.1 0.2 0.5" \
-    --gradient-normalization --augmentation --align-to-input \
-    --output-dir /data/results \
-    2>&1 | tee "$RESULTS_DIR/protenix_run.log" &
-PIDS+=($!)
-echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})"
-
-echo ""
-echo "=========================================="
-echo "All 4 jobs launched! PIDs: ${PIDS[*]}"
-echo "Logs:"
-echo "  - $RESULTS_DIR/boltz2_xrd_run.log"
-echo "  - $RESULTS_DIR/boltz2_md_run.log"
-echo "  - $RESULTS_DIR/rf3_run.log"
-echo "  - $RESULTS_DIR/protenix_run.log"
-echo ""
-echo "Monitor GPU usage: nvidia-smi -l 1"
-echo "Waiting for all jobs to complete..."
-echo "=========================================="
-
-# Wait for all background jobs
-wait
-
-echo ""
-echo "=========================================="
-echo "[$(date)] All jobs completed!"
-echo "=========================================="
+# The TOML preset is the source of truth. This wrapper only supplies smooth
+# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH,
+# and direct use of the prebuilt pixi environments from the image at /app.
+
+set -euo pipefail
+
+script_path="${BASH_SOURCE[0]}"
+while [[ -L "$script_path" ]]; do
+    script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+    script_target="$(readlink "$script_path")"
+    if [[ "$script_target" == /* ]]; then
+        script_path="$script_target"
+    else
+        script_path="$script_dir/$script_target"
+    fi
+done
+script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}"
+
+preset="${SAMPLEWORKS_PRESET:-all_models}"
+if [[ $# -gt 0 && "$1" != -* ]]; then
+    preset="$1"
+    shift
+fi
+
+if [[ "$preset" == *.toml || "$preset" == */* ]]; then
+    if [[ "$preset" != /* ]]; then
+        preset="$repo_root/$preset"
+    fi
+fi
+preset_label="${preset##*/}"
+preset_label="${preset_label%.toml}"
+
+run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}"
+default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}"
+default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa"
+
+export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}"
+export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}"
+export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}"
+export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}"
+export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
+export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}"
+export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
+
+shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
+for checkpoint_var_and_file in \
+    "BOLTZ1_CHECKPOINT boltz1_conf.ckpt" \
+    "BOLTZ2_CHECKPOINT boltz2_conf.ckpt" \
+    "RF3_CHECKPOINT rf3_foundry_01_24_latest.ckpt" \
+    "PROTENIX_CHECKPOINT protenix_base_default_v0.5.0.pt"; do
+    read -r checkpoint_var checkpoint_file <<<"$checkpoint_var_and_file"
+    checkpoint_path="$shared_checkpoint_dir/$checkpoint_file"
+    if [[ -z "${!checkpoint_var:-}" && -f "$checkpoint_path" ]]; then
+        export "$checkpoint_var=$checkpoint_path"
+    fi
+done
+
+source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
+if [[ -f "$source_proteins_csv" ]]; then
+    # The shared proteins.csv currently contains absolute /data/inputs paths,
+    # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
+    # manifest instead of requiring non-root scientists to create /data symlinks.
+    manifest_dir="$RESULTS_DIR/_input_manifest"
+    manifest_proteins_csv="$manifest_dir/proteins.csv"
+    mkdir -p "$manifest_dir"
+    legacy_data_dir="/data/inputs"
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        printf '%s\n' "${line//$legacy_data_dir/$DATA_DIR}"
+    done <"$source_proteins_csv" >"$manifest_proteins_csv"
+    export PROTEINS_CSV="$manifest_proteins_csv"
+fi
+
+runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}"
+pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}"
+if [[ -z "$pixi_project_dir" ]]; then
+    if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then
+        pixi_project_dir="/app"
+    else
+        pixi_project_dir="$repo_root"
+    fi
+fi
+runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}"
+
+needs_runtime_paths=1
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run|--show|--list|-h|--help)
+            needs_runtime_paths=0
+            ;;
+    esac
+done
+
+if [[ "$needs_runtime_paths" -eq 1 ]]; then
+    if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then
+        cat >&2 <<EOF
+Sampleworks input dataset was not found.
+
+Expected: $source_proteins_csv
+
+On an ACTL sampleworks pod, make sure the diffuse-shared PVC is mounted at
+/mnt/diffuse-shared, or override the dataset path, for example:
+
+  DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/<dataset> ./run_all_models.sh
+
+EOF
+        exit 2
+    fi
+    mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR"
+fi
+
+cat >&2 <<EOF
+Sampleworks preset run
+  preset:        $preset
+  data:          $DATA_DIR
+  results:       $RESULTS_DIR
+  msa cache:     $MSA_CACHE_DIR
+  source:        $repo_root
+  pixi project:  $pixi_project_dir
+  runner env:    $runner_env
+  runner python: $runner_python
+
+EOF
+
+cd "$pixi_project_dir"
+if [[ -x "$runner_python" ]]; then
+    runner_env_dir="$(cd -- "$(dirname -- "$runner_python")/.." && pwd)"
+    export PATH="$runner_env_dir/bin${PATH:+:$PATH}"
+    export CONDA_PREFIX="$runner_env_dir"
+    export CUDA_HOME="${CUDA_HOME:-$runner_env_dir}"
+    export PYTHONNOUSERSITE=1
+    exec "$runner_python" -m sampleworks.runs.cli \
+        "$preset" \
+        --results-dir "$RESULTS_DIR" \
+        "$@"
+fi
+
+exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
+    "$preset" \
+    --results-dir "$RESULTS_DIR" \
+    "$@"
diff --git a/run_grid_search.py b/run_grid_search.py
index 66ed9c8a..189a04c4 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -223,16 +223,86 @@ def run_guidance_queue_script(args: tuple[str, int, str, int]):
     job_queue_path, max_workers, model, worker_num = args
     pixi_env = get_pixi_env(model)
     script_path = Path(__file__).parent / "scripts" / "run_guidance_pipeline.py"
-    cmd = f"pixi run -e {pixi_env} python {script_path} --job-queue-path {job_queue_path}"
-    cmd = cmd.split()
+    env_python = get_pixi_env_python(pixi_env)
+    if env_python:
+        cmd = [env_python, str(script_path), "--job-queue-path", job_queue_path]
+        env = get_pixi_env_process_env(env_python)
+    else:
+        cmd = [
+            "pixi",
+            "run",
+            "-e",
+            pixi_env,
+            "python",
+            str(script_path),
+            "--job-queue-path",
+            job_queue_path,
+        ]
+        env = os.environ.copy()
     log.info(f"Running worker {worker_num}: {cmd} on GPU {worker_num % max_workers}")
-    # env = os.environ.copy()
 
     with open(job_queue_path.replace(".pkl", ".log"), "w") as log_file:
-        result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT)
+        result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)
     return result
 
 
+def get_pixi_env_process_env(env_python: str) -> dict[str, str]:
+    """Return process environment values for a direct pixi Python executable.
+
+    Parameters
+    ----------
+    env_python : str
+        Python executable under ``.pixi/envs/<env>/bin/python``.
+
+    Returns
+    -------
+    dict of str to str
+        Environment with the env's ``bin`` directory, ``CONDA_PREFIX``, and
+        ``CUDA_HOME`` set so compiled extensions can find tools such as
+        ``ninja`` and the CUDA toolkit without going through ``pixi run``.
+    """
+    env_dir = Path(env_python).resolve().parent.parent
+    bin_dir = env_dir / "bin"
+    env = os.environ.copy()
+    env["PATH"] = f"{bin_dir}{os.pathsep}{env.get('PATH', '')}"
+    env["CONDA_PREFIX"] = str(env_dir)
+    env.setdefault("CUDA_HOME", str(env_dir))
+    env["PYTHONNOUSERSITE"] = "1"
+    return env
+
+
+def get_pixi_env_python(pixi_env: str) -> str | None:
+    """Return a direct Python binary for a preinstalled pixi environment.
+
+    The ACTL sampleworks image bakes environments under ``/app/.pixi``. Using
+    those interpreters directly avoids a runtime ``pixi run`` cache refresh on
+    shared storage. Set ``SAMPLEWORKS_FORCE_PIXI=1`` to force the old behavior.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Pixi environment name such as ``boltz``, ``protenix``, or ``rf3``.
+
+    Returns
+    -------
+    str or None
+        Path to the environment's Python executable, or ``None`` to use pixi.
+    """
+    if os.environ.get("SAMPLEWORKS_FORCE_PIXI", "").lower() in {"1", "true", "yes"}:
+        return None
+
+    env_key = pixi_env.upper().replace("-", "_")
+    override = os.environ.get(f"SAMPLEWORKS_{env_key}_PYTHON")
+    if override:
+        return override
+
+    pixi_project_dir = Path(os.environ.get("SAMPLEWORKS_PIXI_PROJECT_DIR", "/app"))
+    candidate = pixi_project_dir / ".pixi" / "envs" / pixi_env / "bin" / "python"
+    if candidate.is_file() and os.access(candidate, os.X_OK):
+        return str(candidate)
+    return None
+
+
 def main(args: argparse.Namespace):
     """
     Main pipeline for running grid search trials.
diff --git a/src/sampleworks/models/boltz/wrapper.py b/src/sampleworks/models/boltz/wrapper.py
index c257511e..4efbaecc 100644
--- a/src/sampleworks/models/boltz/wrapper.py
+++ b/src/sampleworks/models/boltz/wrapper.py
@@ -320,7 +320,7 @@ class BoltzConfig:
     """
 
     out_dir: str | Path | None = None
-    num_workers: int = 8
+    num_workers: int = 0
     ensemble_size: int = 1
     recycling_steps: int = 3
 
@@ -329,7 +329,7 @@ def process_structure_for_boltz(
     structure: dict,
     *,
     out_dir: str | Path | None = None,
-    num_workers: int = 8,
+    num_workers: int = 0,
     ensemble_size: int = 1,
     recycling_steps: int | None = 3,
 ) -> dict:
@@ -360,6 +360,9 @@ def process_structure_for_boltz(
     if recycling_steps is None:
         recycling_steps = 3
 
+    # Keep Boltz dataloading in-process by default. Kubernetes pods usually get
+    # a small /dev/shm, and torch DataLoader workers can exhaust it while
+    # sharing large featurized batches back to the parent process.
     config = BoltzConfig(
         out_dir=out_dir or structure.get("metadata", {}).get("id", "boltz_output"),
         num_workers=num_workers,
@@ -567,7 +570,7 @@ def _setup_data_module(
         self,
         input_path: str | Path,
         out_dir: str | Path,
-        num_workers: int = 8,
+        num_workers: int = 0,
     ):
         """Create the Lightning data module used by Boltz to serve data to the model.
 
@@ -628,7 +631,7 @@ def _setup_data_module(
             target_dir=processed.targets_dir,
             msa_dir=processed.msa_dir,
             mol_dir=mol_dir,
-            num_workers=num_workers if num_workers is not None else 8,
+            num_workers=num_workers if num_workers is not None else 0,
             constraints_dir=processed.constraints_dir,
             template_dir=processed_dir / "templates"
             if (processed_dir / "templates").exists()
@@ -1032,7 +1035,7 @@ def _setup_data_module(
         self,
         input_path: str | Path,
         out_dir: str | Path,
-        num_workers: int = 2,
+        num_workers: int = 0,
     ):
         """Create the Lightning data module used by Boltz to serve data to the model.
 
@@ -1090,7 +1093,7 @@ def _setup_data_module(
             manifest=processed.manifest,
             target_dir=processed.targets_dir,
             msa_dir=processed.msa_dir,
-            num_workers=num_workers if num_workers is not None else 2,
+            num_workers=num_workers if num_workers is not None else 0,
             constraints_dir=processed.constraints_dir,
         )
 
diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml
index 72701997..a00461fb 100644
--- a/src/sampleworks/runs/presets/all_models.toml
+++ b/src/sampleworks/runs/presets/all_models.toml
@@ -1,7 +1,7 @@
 description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)."
 
 [defaults]
-DATA_DIR = "/data/input"
+DATA_DIR = "/data/inputs"
 RESULTS_DIR = "/data/results/all_models"
 MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 6a5c6fbb..72ca9bbf 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -15,7 +15,7 @@
 from .schema import Job, Preset
 
 
-GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
+DEFAULT_GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
 
 
 @dataclass(frozen=True)
@@ -68,7 +68,7 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
         args = preset.effective_args(job)
         args.setdefault("output-dir", str(results_dir / job.output_subdir))
         argv = _build_argv(job.env, args)
-        env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus}
+        env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus})
         log_path = results_dir / f"{job.name}_run.log"
         output_dir = Path(args["output-dir"])
         invocations.append(
@@ -95,7 +95,11 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
     list of str
         Subprocess argv.
     """
-    argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT]
+    env_python = _pixi_env_python(pixi_env)
+    if env_python:
+        argv = [env_python, _grid_search_script()]
+    else:
+        argv = ["pixi", "run", "-e", pixi_env, "python", _grid_search_script()]
     for key, value in args.items():
         flag = f"--{key}"
         if isinstance(value, bool):
@@ -108,6 +112,101 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
     return argv
 
 
+def _pixi_env_python(pixi_env: str) -> str | None:
+    """Return the direct Python binary for a baked pixi environment when available.
+
+    The sampleworks ACTL image already contains fully-installed environments at
+    ``/app/.pixi/envs/<env>``. Calling those Python binaries directly avoids
+    ``pixi run`` trying to refresh Git/PyPI caches on shared pod storage.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Pixi environment name from the preset job.
+
+    Returns
+    -------
+    str or None
+        Python path (an override is returned unchecked), or ``None`` to use ``pixi run``.
+    """
+    if os.environ.get("SAMPLEWORKS_FORCE_PIXI", "").lower() in {"1", "true", "yes"}:
+        return None  # opt-out takes precedence over overrides and baked envs
+
+    env_key = pixi_env.upper().replace("-", "_")  # "protenix-dev" -> "PROTENIX_DEV"
+    override = os.environ.get(f"SAMPLEWORKS_{env_key}_PYTHON")
+    if override:
+        return override  # trusted as given; not checked for existence
+
+    candidate = _pixi_project_dir() / ".pixi" / "envs" / pixi_env / "bin" / "python"
+    if candidate.is_file() and os.access(candidate, os.X_OK):
+        return str(candidate)
+    return None  # nothing usable: caller falls back to pixi run
+
+
+def _job_env(pixi_env: str, env: dict[str, str]) -> dict[str, str]:
+    """Return an environment equivalent to activating a direct pixi env.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Pixi environment name used by the job.
+    env : dict of str to str
+        Base process environment.
+
+    Returns
+    -------
+    dict of str to str
+        ``env`` unchanged for ``pixi run`` jobs; otherwise a copy with the env's
+        ``bin`` first on ``PATH`` (no empty entry) and compiler/CUDA paths set.
+    """
+    env_python = _pixi_env_python(pixi_env)
+    if env_python is None:
+        return env
+
+    env_dir = Path(env_python).resolve().parent.parent
+    bin_dir = env_dir / "bin"
+    activated = dict(env)
+    activated["PATH"] = os.pathsep.join(filter(None, (str(bin_dir), activated.get("PATH"))))
+    activated["CONDA_PREFIX"] = str(env_dir)
+    activated.setdefault("CUDA_HOME", str(env_dir))
+    activated["PYTHONNOUSERSITE"] = "1"
+    return activated
+
+
+def _pixi_project_dir() -> Path:
+    """Return the pixi project directory for env lookup and fallback pixi runs.
+
+    Returns
+    -------
+    Path
+        Project directory, defaulting to ``/app`` for the sampleworks image or
+        the current working directory outside that image.
+    """
+    override = os.environ.get("SAMPLEWORKS_PIXI_PROJECT_DIR")
+    if override:
+        return Path(override)  # explicit override is used without validation
+    app = Path("/app")
+    if (app / "pyproject.toml").exists():  # /app only counts when it holds a project file
+        return app
+    return Path.cwd()  # no override and no /app project: use the caller's directory
+
+
+def _grid_search_script() -> str:
+    """Return the ``run_grid_search.py`` path used by worker jobs.
+
+    Resolution is intentionally simple for the ACTL sampleworks image: the
+    baked image keeps a stable copy at :data:`DEFAULT_GRID_SEARCH_SCRIPT`, while
+    synced PR worktrees can point the runner at their checkout with
+    ``SAMPLEWORKS_GRID_SEARCH_SCRIPT=/home/dev/workspace/run_grid_search.py``.
+
+    Returns
+    -------
+    str
+        Override path when set and non-empty, else the baked default script.
+    """
+    return os.environ.get("SAMPLEWORKS_GRID_SEARCH_SCRIPT") or DEFAULT_GRID_SEARCH_SCRIPT
+
+
 def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
     """Launch every job in parallel and wait for completion.
 
@@ -137,6 +236,11 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
             _print_dry_run(inv)
         return 0
 
+    pixi_envs = sorted({inv.job.env for inv in invocations})
+    for pixi_env in pixi_envs:
+        _prepare_pixi_env(pixi_env)
+    invocations = build_invocations(preset, results_dir=results_dir)
+
     _print_launch_summary(preset, invocations)
     processes: list[_RunningJob] = []
     try:
@@ -165,6 +269,43 @@ def _terminate_all(jobs: list[_RunningJob]) -> None:
         j.tee_thread.join()
 
 
+def _prepare_pixi_env(pixi_env: str) -> None:
+    """Prepare a pixi environment before parallel job launch.
+
+    ``pixi run`` is deliberately called once per env even when the interpreter
+    directory already exists, because pixi may still need to materialize PyPI
+    packages into that environment after image startup.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Pixi environment to prepare.
+
+    Raises
+    ------
+    subprocess.CalledProcessError
+        If the ``pixi run`` command exits with a non-zero status.
+    """
+    if os.environ.get("SAMPLEWORKS_SKIP_ENV_PREPARE", "").lower() in {
+        "1",
+        "true",
+        "yes",
+    }:
+        return  # explicit opt-out: nothing is executed
+
+    env = {
+        **os.environ,  # the two keys below replace any inherited values
+        "PIXI_CACHE_DIR": os.environ.get("PIXI_CACHE_DIR", "/tmp/pixi-cache"),
+        "UV_CACHE_DIR": os.environ.get("UV_CACHE_DIR", "/tmp/uv-cache"),
+    }
+    cmd = ["pixi", "run", "-e", pixi_env, "python", "-c", "print('ready')"]
+    print(
+        f"[{_ts()}] preparing pixi env {pixi_env!r} with {shlex.join(cmd)}",
+        file=sys.stderr,
+    )
+    subprocess.run(cmd, cwd=str(_pixi_project_dir()), env=env, check=True)  # raises on failure
+
+
 def _print_dry_run(inv: JobInvocation) -> None:
     """Print the exact command for one job without launching it.
 
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index 6e1b768a..b57a84e9 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -1,7 +1,7 @@
 """Dataclasses for the preset schema.
 
 A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
-is launched as ``pixi run -e <env> python /app/run_grid_search.py <args>`` with
+is launched as ``pixi run -e <env> python <run_grid_search.py> <args>`` with
 ``CUDA_VISIBLE_DEVICES`` set to the job's GPU assignment.
 """
 
diff --git a/src/sampleworks/utils/guidance_script_arguments.py b/src/sampleworks/utils/guidance_script_arguments.py
index 9c2601fa..2876c06e 100644
--- a/src/sampleworks/utils/guidance_script_arguments.py
+++ b/src/sampleworks/utils/guidance_script_arguments.py
@@ -9,16 +9,34 @@
 from sampleworks.utils.guidance_constants import GuidanceType, StructurePredictor
 
 
-# Baked-in checkpoint paths (Docker image) with legacy fallbacks
+# Baked-in checkpoint paths (Docker image), ACTL shared-storage paths, and
+# legacy fallbacks. Environment variables win when present.
+_CHECKPOINT_ENV_VARS = {
+    "boltz1": "BOLTZ1_CHECKPOINT",
+    "boltz2": "BOLTZ2_CHECKPOINT",
+    "rf3": "RF3_CHECKPOINT",
+    "protenix": "PROTENIX_CHECKPOINT",
+}
+
 _CHECKPOINT_CANDIDATES = {
-    "boltz1": ["/checkpoints/boltz1_conf.ckpt", "~/.boltz/boltz1_conf.ckpt"],
-    "boltz2": ["/checkpoints/boltz2_conf.ckpt", "~/.boltz/boltz2_conf.ckpt"],
+    "boltz1": [
+        "/checkpoints/boltz1_conf.ckpt",
+        "/mnt/diffuse-shared/raw/checkpoints/boltz1_conf.ckpt",
+        "~/.boltz/boltz1_conf.ckpt",
+    ],
+    "boltz2": [
+        "/checkpoints/boltz2_conf.ckpt",
+        "/mnt/diffuse-shared/raw/checkpoints/boltz2_conf.ckpt",
+        "~/.boltz/boltz2_conf.ckpt",
+    ],
     "rf3": [
         "/checkpoints/rf3_foundry_01_24_latest.ckpt",
+        "/mnt/diffuse-shared/raw/checkpoints/rf3_foundry_01_24_latest.ckpt",
         "~/.foundry/checkpoints/rf3_foundry_01_24_latest.ckpt",
     ],
     "protenix": [
         "/checkpoints/protenix_base_default_v0.5.0.pt",
+        "/mnt/diffuse-shared/raw/checkpoints/protenix_base_default_v0.5.0.pt",
         ".pixi/envs/protenix-dev/lib/python3.12/site-packages/release_data/checkpoint/protenix_base_default_v0.5.0.pt",
     ],
 }
@@ -31,7 +49,11 @@ def _resolve_checkpoint(model_key: str) -> str:
     legacy development paths.  If none are found the first candidate is returned
     so that downstream validation produces a clear error message.
     """
-    candidates = _CHECKPOINT_CANDIDATES.get(model_key, [])
+    env_var = _CHECKPOINT_ENV_VARS.get(model_key)
+    candidates = []
+    if env_var and os.environ.get(env_var):
+        candidates.append(os.environ[env_var])
+    candidates.extend(_CHECKPOINT_CANDIDATES.get(model_key, []))
     for candidate in candidates:
         resolved = Path(candidate).expanduser()
         if resolved.exists():
@@ -45,9 +67,10 @@ def _resolve_checkpoint(model_key: str) -> str:
             f"Provide --model-checkpoint or bake checkpoints into /checkpoints/."
         )
     if not Path(resolved).exists():
+        env_hint = _CHECKPOINT_ENV_VARS.get(model_key, "a checkpoint env var")
         raise ValueError(
-            f"Model checkpoint '{resolved}' does not exist. "
-            f"Provide a valid path via --model-checkpoint."
+            f"Model checkpoint for '{model_key}' was not found. Checked: {candidates}. "
+            f"Provide --model-checkpoint or set {env_hint}."
         )
 
     return resolved
diff --git a/src/sampleworks/utils/guidance_script_utils.py b/src/sampleworks/utils/guidance_script_utils.py
index a72492ee..5d26a7c6 100644
--- a/src/sampleworks/utils/guidance_script_utils.py
+++ b/src/sampleworks/utils/guidance_script_utils.py
@@ -459,8 +459,14 @@ def _run_guidance(args: GuidanceConfig, guidance_type: str, model_wrapper, devic
     elif "Boltz" in wrapper_class_name:
         from sampleworks.models.boltz.wrapper import process_structure_for_boltz
 
+        # Boltz preprocessing writes manifest/NPZ/MSA files as a side effect.
+        # Keep those under the per-job output directory so concurrent grid jobs
+        # for the same protein do not race on a shared metadata-derived path.
         structure = process_structure_for_boltz(
-            structure, ensemble_size=args.ensemble_size, recycling_steps=recycling_steps
+            structure,
+            out_dir=args.output_dir,
+            ensemble_size=args.ensemble_size,
+            recycling_steps=recycling_steps,
         )
     else:
         raise ValueError(f"Unknown model wrapper class: {wrapper_class_name}")
diff --git a/tests/runs/conftest.py b/tests/runs/conftest.py
new file mode 100644
index 00000000..a20482c0
--- /dev/null
+++ b/tests/runs/conftest.py
@@ -0,0 +1,11 @@
+"""Shared test fixtures for preset-runner tests."""
+
+from __future__ import annotations
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def force_pixi_argv(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Keep argv assertions deterministic on machines with /app/.pixi present."""
+    monkeypatch.setenv("SAMPLEWORKS_FORCE_PIXI", "1")
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 6528a602..7a2bb1b1 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -43,6 +43,15 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
     assert preset.defaults["DATA_DIR"] == "/data/inputs"
 
 
+def test_all_models_uses_canonical_inputs_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+    """The flagship preset must use /data/inputs, matching the ACTL wrapper."""
+    monkeypatch.delenv("DATA_DIR", raising=False)  # presumably an ambient value would win
+    monkeypatch.setenv("HOME", "/home/test")
+    preset = loader.load_preset("all_models")
+    assert preset.defaults["DATA_DIR"] == "/data/inputs"
+    assert preset.shared_args["proteins"] == "/data/inputs/proteins.csv"  # ${DATA_DIR} expanded
+
+
 def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 61c5b758..040ea0c1 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -100,6 +100,40 @@ def test_build_invocations_records_output_dir(monkeypatch: pytest.MonkeyPatch) -
     assert inv.output_dir == Path("/r/rf3")
 
 
+def test_grid_search_script_can_be_overridden(monkeypatch: pytest.MonkeyPatch) -> None:
+    """ACTL wrappers can run the synced checkout instead of the baked /app copy."""
+    monkeypatch.setenv("HOME", "/home/test")
+    monkeypatch.setenv("SAMPLEWORKS_GRID_SEARCH_SCRIPT", "/home/dev/workspace/run_grid_search.py")
+    preset = loader.load_preset("rf3_partial")
+    inv = runner.build_invocations(preset, results_dir=Path("/r"))[0]
+    assert inv.argv[:6] == [  # pixi form: conftest sets SAMPLEWORKS_FORCE_PIXI=1
+        "pixi",
+        "run",
+        "-e",
+        "rf3",
+        "python",
+        "/home/dev/workspace/run_grid_search.py",
+    ]
+
+
+def test_uses_baked_env_python_when_available(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """ACTL image runs bypass pixi cache refreshes by calling env Python directly."""
+    monkeypatch.delenv("SAMPLEWORKS_FORCE_PIXI", raising=False)  # undo conftest's opt-out
+    monkeypatch.setenv("HOME", "/home/test")
+    pixi_project = tmp_path / "app"
+    python_bin = pixi_project / ".pixi" / "envs" / "rf3" / "bin" / "python"
+    python_bin.parent.mkdir(parents=True)
+    python_bin.write_text("#!/bin/sh\n")
+    python_bin.chmod(0o755)  # the runner only accepts an executable file
+    monkeypatch.setenv("SAMPLEWORKS_PIXI_PROJECT_DIR", str(pixi_project))
+    # NOTE(review): assumes SAMPLEWORKS_GRID_SEARCH_SCRIPT/SAMPLEWORKS_RF3_PYTHON are unset.
+    preset = loader.load_preset("rf3_partial")
+    inv = runner.build_invocations(preset, results_dir=Path("/r"))[0]
+    assert inv.argv[:2] == [str(python_bin), "/app/run_grid_search.py"]
+
+
 def test_dry_run_does_not_create_directories(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:

From 6869dcbde735cc3313fc98abba0ab62921fe1c8c Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Thu, 21 May 2026 15:28:17 -0400
Subject: [PATCH 10/28] fix(runs): align experiment entrypoint and boltz tests

---
 Dockerfile                               |   8 +-
 GRID_SEARCH.md                           |   2 +-
 README.md                                |  44 ++-----
 run_all_models.sh                        | 132 +--------------------
 run_experiments                          | 145 +++++++++++++++++++++++
 run_experiments.sh                       |  17 +++
 tests/models/boltz/test_boltz_wrapper.py |   4 +-
 7 files changed, 183 insertions(+), 169 deletions(-)
 create mode 100755 run_experiments
 create mode 100755 run_experiments.sh

diff --git a/Dockerfile b/Dockerfile
index c7ed94d7..f6d8f495 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -129,10 +129,12 @@ RUN pixi run -e boltz python -c "\
 from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \
 print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)"
 
-COPY run_all_models.sh ./
-RUN chmod +x /app/run_all_models.sh \
+COPY run_experiments run_experiments.sh run_all_models.sh ./
+RUN chmod +x /app/run_experiments /app/run_experiments.sh /app/run_all_models.sh \
+    && printf '#!/usr/bin/env bash\nexec /app/run_experiments "$@"\n' > /usr/local/bin/run_experiments \
+    && printf '#!/usr/bin/env bash\nexec /app/run_experiments.sh "$@"\n' > /usr/local/bin/run_experiments.sh \
     && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \
-    && chmod +x /usr/local/bin/run_all_models.sh \
+    && chmod +x /usr/local/bin/run_experiments /usr/local/bin/run_experiments.sh /usr/local/bin/run_all_models.sh \
     && printf '\n# ACTL scientist workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n    cd /app\nfi\n' >> /root/.bashrc
 
 # Set default checkpoint paths via environment variables
diff --git a/GRID_SEARCH.md b/GRID_SEARCH.md
index 154d00c5..4ae15630 100644
--- a/GRID_SEARCH.md
+++ b/GRID_SEARCH.md
@@ -6,7 +6,7 @@ and how to find and read logs if you need to debug the process.
 
 ## Optional: Setting up the docker container
 It is often useful to have a docker container with all the dependencies installed.
-Our script `run_all_models.sh` for instance uses a docker container to manage all
+Our script `run_experiments` for instance uses a docker container to manage all
 dependencies. To run that script, you will need to have docker installed. Build
 the container with
 ```shell
diff --git a/README.md b/README.md
index dade48e7..73c4783b 100644
--- a/README.md
+++ b/README.md
@@ -152,53 +152,31 @@ Output layout: `grid_search_results/<protein>/<model>[_<method>]/<scaler>/ens<N>
 Instructions for running evaluation and metrics scripts are coming soon.
 
 
-## ACTL preset experiments (`run_all_models.sh` / `sampleworks-runs`)
+## ACTL preset experiments (`run_experiments`)
 
-For canonical multi-model/multi-GPU sweeps, `sampleworks-runs` orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares the model, pixi env, GPU assignment, output subdir, and CLI args. The runner launches jobs in parallel, tees per-job logs, and aggregates exit codes.
-
-On ACTL, start the pod with the prebuilt image and shared storage, then run one command inside the pod shell:
+Use ACTL to get a ready-to-run Sampleworks pod with 8 GPUs and the shared data PVC:
 
 ```bash
 actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
-
-# inside the ACTL pod shell
-# the sampleworks image drops interactive shells in /app
-run_all_models.sh --dry-run   # inspect commands first
-run_all_models.sh             # run /app/src/sampleworks/runs/presets/all_models.toml
 ```
 
-The wrapper keeps the TOML preset as the source of truth. It only supplies ACTL-friendly defaults:
-
-- `DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps`
-- `RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks/<pod>/all_models`
-- `MSA_CACHE_DIR=/mnt/diffuse-shared/cache/sampleworks/msa`
-- `PYTHONPATH=/app/src`, using the copy baked into the sampleworks image
-- direct `/app/.pixi/envs/<env>/bin/python` execution, so it reuses the environments baked into the sampleworks image without refreshing pixi caches
-- `/tmp` pixi/uv caches for any missing environment preparation, avoiding shared-storage Git cache issues
-
-Common commands:
+Inside the pod shell (`/app`), run:
 
 ```bash
-run_all_models.sh --list                         # bundled presets
-run_all_models.sh all_models --show              # inspect resolved values
-run_all_models.sh all_models --only rf3,protenix # subset jobs
-run_all_models.sh rf3_partial                    # run a smaller preset
-
-# Override paths or parameters without editing TOML:
-DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/my_dataset run_all_models.sh rf3_partial
-run_all_models.sh rf3_partial \
-    --set jobs.rf3.gpus=0 \
-    --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
+run_experiments --dry-run
+run_experiments all_models
 ```
 
-Bundled presets live in `src/sampleworks/runs/presets/*.toml`. You can also copy one, edit it, and run it by path:
+`run_experiments` is a thin wrapper around `sampleworks-runs`: it reads TOML presets and launches the requested `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job. The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2 MD, RF3, and Protenix.
+
+Presets live in `/app/src/sampleworks/runs/presets/*.toml` (same path in the repo: `src/sampleworks/runs/presets/`). To change an experiment, either edit/copy a preset or override values at launch:
 
 ```bash
-cp src/sampleworks/runs/presets/all_models.toml my_experiment.toml
-run_all_models.sh ./my_experiment.toml
+run_experiments all_models --only rf3,protenix
+run_experiments rf3_partial --set jobs.rf3.gpus=0
 ```
 
-Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block.
+The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<pod>/<preset>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations. `run_all_models.sh` remains as a compatibility alias.
 
 
 ## Docker
diff --git a/run_all_models.sh b/run_all_models.sh
index 96b032b0..f1f81b8c 100755
--- a/run_all_models.sh
+++ b/run_all_models.sh
@@ -1,10 +1,5 @@
 #!/usr/bin/env bash
-# ACTL-native entry point for Sampleworks preset runs.
-#
-# The TOML preset is the source of truth. This wrapper only supplies smooth
-# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH,
-# and direct use of the prebuilt pixi environments from the image at /app.
-
+# Backward-compatible alias. Prefer run_experiments for new docs/usage.
 set -euo pipefail
 
 script_path="${BASH_SOURCE[0]}"
@@ -18,128 +13,5 @@ while [[ -L "$script_path" ]]; do
     fi
 done
 script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
-repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}"
-
-preset="${SAMPLEWORKS_PRESET:-all_models}"
-if [[ $# -gt 0 && "$1" != -* ]]; then
-    preset="$1"
-    shift
-fi
-
-if [[ "$preset" == *.toml || "$preset" == */* ]]; then
-    if [[ "$preset" != /* ]]; then
-        preset="$repo_root/$preset"
-    fi
-fi
-preset_label="${preset##*/}"
-preset_label="${preset_label%.toml}"
-
-run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}"
-default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}"
-default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa"
-
-export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}"
-export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}"
-export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}"
-export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}"
-export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
-export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}"
-export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
-
-shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
-for checkpoint_var_and_file in \
-    "BOLTZ1_CHECKPOINT boltz1_conf.ckpt" \
-    "BOLTZ2_CHECKPOINT boltz2_conf.ckpt" \
-    "RF3_CHECKPOINT rf3_foundry_01_24_latest.ckpt" \
-    "PROTENIX_CHECKPOINT protenix_base_default_v0.5.0.pt"; do
-    read -r checkpoint_var checkpoint_file <<<"$checkpoint_var_and_file"
-    checkpoint_path="$shared_checkpoint_dir/$checkpoint_file"
-    if [[ -z "${!checkpoint_var:-}" && -f "$checkpoint_path" ]]; then
-        export "$checkpoint_var=$checkpoint_path"
-    fi
-done
-
-source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
-if [[ -f "$source_proteins_csv" ]]; then
-    # The shared proteins.csv currently contains absolute /data/inputs paths,
-    # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
-    # manifest instead of requiring non-root scientists to create /data symlinks.
-    manifest_dir="$RESULTS_DIR/_input_manifest"
-    manifest_proteins_csv="$manifest_dir/proteins.csv"
-    mkdir -p "$manifest_dir"
-    legacy_data_dir="/data/inputs"
-    while IFS= read -r line || [[ -n "$line" ]]; do
-        printf '%s\n' "${line//$legacy_data_dir/$DATA_DIR}"
-    done <"$source_proteins_csv" >"$manifest_proteins_csv"
-    export PROTEINS_CSV="$manifest_proteins_csv"
-fi
-
-runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}"
-pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}"
-if [[ -z "$pixi_project_dir" ]]; then
-    if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then
-        pixi_project_dir="/app"
-    else
-        pixi_project_dir="$repo_root"
-    fi
-fi
-runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}"
-
-needs_runtime_paths=1
-for arg in "$@"; do
-    case "$arg" in
-        --dry-run|--show|--list|-h|--help)
-            needs_runtime_paths=0
-            ;;
-    esac
-done
-
-if [[ "$needs_runtime_paths" -eq 1 ]]; then
-    if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then
-        cat >&2 <<EOF
-Sampleworks input dataset was not found.
-
-Expected: $source_proteins_csv
-
-On an ACTL sampleworks pod, make sure the diffuse-shared PVC is mounted at
-/mnt/diffuse-shared, or override the dataset path, for example:
-
-  DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/<dataset> ./run_all_models.sh
-
-EOF
-        exit 2
-    fi
-    mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR"
-fi
-
-cat >&2 <<EOF
-Sampleworks preset run
-  preset:        $preset
-  data:          $DATA_DIR
-  results:       $RESULTS_DIR
-  msa cache:     $MSA_CACHE_DIR
-  source:        $repo_root
-  pixi project:  $pixi_project_dir
-  runner env:    $runner_env
-  runner python: $runner_python
-
-EOF
-
-cd "$pixi_project_dir"
-if [[ -x "$runner_python" ]]; then
-    runner_env_dir="$(cd -- "$(dirname -- "$runner_python")/.." && pwd)"
-    export PATH="$runner_env_dir/bin${PATH:+:$PATH}"
-    export CONDA_PREFIX="$runner_env_dir"
-    export CUDA_HOME="${CUDA_HOME:-$runner_env_dir}"
-    export PYTHONNOUSERSITE=1
-    exec "$runner_python" -m sampleworks.runs.cli \
-        "$preset" \
-        --results-dir "$RESULTS_DIR" \
-        "$@"
-fi
 
-exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
-    "$preset" \
-    --results-dir "$RESULTS_DIR" \
-    "$@"
+exec "$script_dir/run_experiments" "$@"
diff --git a/run_experiments b/run_experiments
new file mode 100755
index 00000000..5ca3e0f4
--- /dev/null
+++ b/run_experiments
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# ACTL-native entry point for Sampleworks preset runs.
+#
+# The TOML preset is the source of truth. This wrapper only supplies smooth
+# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH,
+# and direct use of the prebuilt pixi environments from the image at /app.
+
+set -euo pipefail
+
+script_path="${BASH_SOURCE[0]}"
+while [[ -L "$script_path" ]]; do
+    script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+    script_target="$(readlink "$script_path")"
+    if [[ "$script_target" == /* ]]; then
+        script_path="$script_target"
+    else
+        script_path="$script_dir/$script_target"
+    fi
+done
+script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}"
+
+preset="${SAMPLEWORKS_PRESET:-all_models}"
+if [[ $# -gt 0 && "$1" != -* ]]; then  # only a leading non-option argument names the preset
+    preset="$1"
+    shift
+fi
+
+if [[ "$preset" == *.toml || "$preset" == */* ]]; then  # a file path, not a bare preset name
+    if [[ "$preset" != /* ]]; then
+        preset="$repo_root/$preset"  # NOTE(review): anchored at $repo_root, not the caller's $PWD
+    fi
+fi
+preset_label="${preset##*/}"  # basename of the preset ...
+preset_label="${preset_label%.toml}"  # ... without .toml; used in the default results path
+
+run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}"
+default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
+default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}"
+default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa"
+
+export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}"
+export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}"
+export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}"
+export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}"
+export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
+export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}"
+export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
+
+shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
+for checkpoint_var_and_file in \
+    "BOLTZ1_CHECKPOINT boltz1_conf.ckpt" \
+    "BOLTZ2_CHECKPOINT boltz2_conf.ckpt" \
+    "RF3_CHECKPOINT rf3_foundry_01_24_latest.ckpt" \
+    "PROTENIX_CHECKPOINT protenix_base_default_v0.5.0.pt"; do
+    read -r checkpoint_var checkpoint_file <<<"$checkpoint_var_and_file"
+    checkpoint_path="$shared_checkpoint_dir/$checkpoint_file"
+    if [[ -z "${!checkpoint_var:-}" && -f "$checkpoint_path" ]]; then
+        export "$checkpoint_var=$checkpoint_path"
+    fi
+done
+
+source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
+if [[ -f "$source_proteins_csv" ]]; then
+    # The shared proteins.csv currently contains absolute /data/inputs paths,
+    # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
+    # manifest instead of requiring non-root scientists to create /data symlinks.
+    manifest_dir="$RESULTS_DIR/_input_manifest"
+    manifest_proteins_csv="$manifest_dir/proteins.csv"
+    mkdir -p "$manifest_dir"  # NOTE(review): also runs for --dry-run/--show/--list/--help
+    legacy_data_dir="/data/inputs"
+    while IFS= read -r line || [[ -n "$line" ]]; do  # "|| -n" keeps an unterminated last line
+        printf '%s\n' "${line//$legacy_data_dir/$DATA_DIR}"  # replaces every occurrence
+    done <"$source_proteins_csv" >"$manifest_proteins_csv"
+    export PROTEINS_CSV="$manifest_proteins_csv"  # later steps see the rewritten copy
+fi
+
+runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}"
+pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}"
+if [[ -z "$pixi_project_dir" ]]; then
+    if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then
+        pixi_project_dir="/app"
+    else
+        pixi_project_dir="$repo_root"
+    fi
+fi
+runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}"
+
+needs_runtime_paths=1
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run|--show|--list|-h|--help)
+            needs_runtime_paths=0
+            ;;
+    esac
+done
+
+if [[ "$needs_runtime_paths" -eq 1 ]]; then
+    if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then
+        cat >&2 <<EOF
+Sampleworks input dataset was not found.
+
+Expected: $source_proteins_csv
+
+On an ACTL sampleworks pod, make sure the diffuse-shared PVC is mounted at
+/mnt/diffuse-shared, or override the dataset path, for example:
+
+  DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/<dataset> ./run_experiments
+
+EOF
+        exit 2
+    fi
+    mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR"
+fi
+
+cat >&2 <<EOF
+Sampleworks preset run
+  preset:        $preset
+  data:          $DATA_DIR
+  results:       $RESULTS_DIR
+  msa cache:     $MSA_CACHE_DIR
+  source:        $repo_root
+  pixi project:  $pixi_project_dir
+  runner env:    $runner_env
+  runner python: $runner_python
+
+EOF
+
+cd "$pixi_project_dir"
+if [[ -x "$runner_python" ]]; then
+    runner_env_dir="$(cd -- "$(dirname -- "$runner_python")/.." && pwd)"
+    export PATH="$runner_env_dir/bin${PATH:+:$PATH}"
+    export CONDA_PREFIX="$runner_env_dir"
+    export CUDA_HOME="${CUDA_HOME:-$runner_env_dir}"
+    export PYTHONNOUSERSITE=1
+    exec "$runner_python" -m sampleworks.runs.cli \
+        "$preset" \
+        --results-dir "$RESULTS_DIR" \
+        "$@"
+fi
+
+exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
+    "$preset" \
+    --results-dir "$RESULTS_DIR" \
+    "$@"
diff --git a/run_experiments.sh b/run_experiments.sh
new file mode 100755
index 00000000..f4feba11
--- /dev/null
+++ b/run_experiments.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Backward-compatible alias for the extensionless run_experiments command.
+set -euo pipefail
+
+script_path="${BASH_SOURCE[0]}"
+while [[ -L "$script_path" ]]; do  # follow symlinks until the real script file is reached
+    script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+    script_target="$(readlink "$script_path")"
+    if [[ "$script_target" == /* ]]; then
+        script_path="$script_target"
+    else
+        script_path="$script_dir/$script_target"  # relative link: resolve against its directory
+    fi
+done
+script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
+
+exec "$script_dir/run_experiments" "$@"  # replace this shell; all arguments pass through
diff --git a/tests/models/boltz/test_boltz_wrapper.py b/tests/models/boltz/test_boltz_wrapper.py
index 47630314..94c7d085 100644
--- a/tests/models/boltz/test_boltz_wrapper.py
+++ b/tests/models/boltz/test_boltz_wrapper.py
@@ -112,7 +112,7 @@ def test_annotate_preserves_original_structure(
     def test_annotate_default_values(self, structure_6b8x: dict, temp_output_dir: Path):
         result = process_structure_for_boltz(structure_6b8x, out_dir=temp_output_dir)
         config = result["_boltz_config"]
-        assert config.num_workers == 8
+        assert config.num_workers == 0
         assert config.ensemble_size == 1
         assert config.recycling_steps == 3
 
@@ -142,7 +142,7 @@ class TestBoltzConfig:
     def test_boltz_config_default_values(self):
         config = BoltzConfig()
         assert config.out_dir is None
-        assert config.num_workers == 8
+        assert config.num_workers == 0
         assert config.ensemble_size == 1
         assert config.recycling_steps == 3
 

From e56d1f1199de7e0238f013a58b8d0f2b4a4962ee Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 11:11:38 -0400
Subject: [PATCH 11/28] fix(runs): prefer synced workspace source

---
 Dockerfile                     |  27 +++---
 GRID_SEARCH.md                 |   4 +-
 README.md                      |  13 ++-
 docker-entrypoint.sh           |  50 ++++++------
 run_experiments                |  59 ++++++++++++--
 run_grid_search.py             |  23 +++++-
 src/sampleworks/runs/runner.py | 145 +++++++++++++++++++++++++++++++--
 tests/runs/test_runner.py      |  37 +++++++++
 8 files changed, 303 insertions(+), 55 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f6d8f495..04e29ad9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@
 # Checkpoints are baked into the image at /checkpoints/ via a pre-built base image.
 #
 # Build:
-#   docker build -t sampleworks .
+#   docker build -t pixi-with-checkpoints .
 #
 # CI builds pull checkpoints automatically from Harbor via:
 #   COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest
@@ -16,10 +16,10 @@
 #
 # Run examples:
 #   # Show help
-#   docker run sampleworks --help
+#   docker run pixi-with-checkpoints --help
 #
 #   # Run grid search with Boltz1 (checkpoint baked in)
-#   docker run --gpus all -v /data:/data sampleworks \
+#   docker run --gpus all -v /data:/data pixi-with-checkpoints \
 #     -e boltz run_grid_search.py \
 #     --proteins /data/proteins.csv \
 #     --models boltz1 \
@@ -33,7 +33,7 @@
 #     --align-to-input
 #
 #   # Run grid search with Boltz2 (checkpoint baked in)
-#   docker run --gpus all -v /data:/data sampleworks \
+#   docker run --gpus all -v /data:/data pixi-with-checkpoints \
 #     -e boltz run_grid_search.py \
 #     --proteins /data/proteins.csv \
 #     --models boltz2 \
@@ -45,7 +45,7 @@
 #     --use-tweedie
 #
 #   # Interactive shell
-#   docker run --gpus all -it sampleworks bash
+#   docker run --gpus all -it pixi-with-checkpoints bash
 #
 # Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest):
 #   /checkpoints/boltz1_conf.ckpt                   - Boltz1 model (~3.5GB)
@@ -129,13 +129,16 @@ RUN pixi run -e boltz python -c "\
 from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \
 print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)"
 
-COPY run_experiments run_experiments.sh run_all_models.sh ./
-RUN chmod +x /app/run_experiments /app/run_experiments.sh /app/run_all_models.sh \
-    && printf '#!/usr/bin/env bash\nexec /app/run_experiments "$@"\n' > /usr/local/bin/run_experiments \
-    && printf '#!/usr/bin/env bash\nexec /app/run_experiments.sh "$@"\n' > /usr/local/bin/run_experiments.sh \
-    && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \
-    && chmod +x /usr/local/bin/run_experiments /usr/local/bin/run_experiments.sh /usr/local/bin/run_all_models.sh \
-    && printf '\n# ACTL scientist workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n    cd /app\nfi\n' >> /root/.bashrc
+# This image carries pixi environments and checkpoints. Runtime source should
+# come from ACTL's synced checkout at /home/dev/workspace, not from stale code
+# baked into /app during image construction.
+RUN rm -rf /app/src /app/scripts /app/run_grid_search.py \
+    && mkdir -p /home/dev/workspace
+
+COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/
+RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n    cd /home/dev/workspace\nfi\n' >> /root/.bashrc
+
+ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app
 
 # Set default checkpoint paths via environment variables
 ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \
diff --git a/GRID_SEARCH.md b/GRID_SEARCH.md
index 4ae15630..b8f9aa73 100644
--- a/GRID_SEARCH.md
+++ b/GRID_SEARCH.md
@@ -10,10 +10,10 @@ Our script `run_experiments` for instance uses a docker container to manage all
 dependencies. To run that script, you will need to have docker installed. Build
 the container with
 ```shell
-docker build -t diffuseproject/sampleworks .
+docker build -t pixi-with-checkpoints .
 ```
 which will add an image to your local docker repository called
-`diffuseproject/sampleworks:latest`. The top of the `Dockerfile` contains
+`pixi-with-checkpoints:latest`. The top of the `Dockerfile` contains
 instructions on how to use the container as well. The container entrypoint
 (`docker-entrypoint`) is fairly generic and is used to call the `run_grid_search.py`
 script described below.
diff --git a/README.md b/README.md
index 73c4783b..07a95646 100644
--- a/README.md
+++ b/README.md
@@ -154,13 +154,15 @@ Instructions for running evaluation and metrics scripts are coming soon.
 
 ## ACTL preset experiments (`run_experiments`)
 
-Use ACTL to get a ready-to-run Sampleworks pod with 8 GPUs and the shared data PVC:
+Use ACTL to get a ready-to-run pod with baked pixi environments, checkpoints,
+and the shared data PVC:
 
 ```bash
-actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+actl pod up sampleworks-pr236 --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:cuda12.4-2026-05-21-pr240-workspace1 --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
 ```
 
-Inside the pod shell (`/app`), run:
+ACTL syncs your local checkout to `/home/dev/workspace`; interactive shells land
+there. Run experiments from that synced checkout, not from `/app`:
 
 ```bash
 run_experiments --dry-run
@@ -169,13 +171,16 @@ run_experiments all_models
 
 `run_experiments` is a thin wrapper around `sampleworks-runs`: it reads TOML presets and launches the requested `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job. The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2 MD, RF3, and Protenix.
 
-Presets live in `/app/src/sampleworks/runs/presets/*.toml` (same path in the repo: `src/sampleworks/runs/presets/`). To change an experiment, either edit/copy a preset or override values at launch:
+Presets live in the synced repo at `src/sampleworks/runs/presets/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
 
 ```bash
 run_experiments all_models --only rf3,protenix
 run_experiments rf3_partial --set jobs.rf3.gpus=0
 ```
 
+On smaller pods, make sure preset GPU IDs only reference visible pod GPUs
+(`0..N-1`). `run_experiments` fails fast if a preset requests unavailable GPUs.
+
 The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<pod>/<preset>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations. `run_all_models.sh` remains as a compatibility alias.
 
 
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 957c351d..0477a7dc 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -2,15 +2,15 @@
 # Sampleworks Docker Entrypoint
 #
 # Usage:
-#   docker run sampleworks -e <pixi_env> <script> [args...]
-#   docker run sampleworks -e boltz run_grid_search.py --proteins /data/proteins.csv ...
-#   docker run sampleworks bash  # interactive shell
+#   docker run pixi-with-checkpoints -e <pixi_env> <script> [args...]
+#   docker run pixi-with-checkpoints -e boltz run_grid_search.py --proteins /data/proteins.csv ...
+#   docker run pixi-with-checkpoints bash  # interactive shell
 #
 # Available pixi environments: boltz, protenix, rf3
 #
 # Examples:
 #   # Run grid search with RF3
-#   docker run --gpus all -v /data:/data sampleworks \
+#   docker run --gpus all -v /data:/data pixi-with-checkpoints \
 #     -e rf3 run_grid_search.py \
 #     --proteins /data/proteins.csv \
 #     --models rf3 \
@@ -28,9 +28,9 @@ show_help() {
 Sampleworks - Protein structure prediction with diffusion model guidance
 
 USAGE:
-    docker run --gpus all --shm-size=16g sampleworks -e <environment> <script> [arguments...]
-    docker run sampleworks bash
-    docker run sampleworks --help
+    docker run --gpus all --shm-size=16g pixi-with-checkpoints -e <environment> <script> [arguments...]
+    docker run pixi-with-checkpoints bash
+    docker run pixi-with-checkpoints --help
 
 IMPORTANT:
     Always use --shm-size=16g (or larger) to avoid shared memory errors with DataLoaders.
@@ -47,7 +47,7 @@ ENVIRONMENTS:
 
 EXAMPLES:
     # Run grid search with RF3 model
-    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
+    docker run --gpus all --shm-size=16g -v /data:/data pixi-with-checkpoints \
       -e rf3 run_grid_search.py \
       --proteins /data/proteins.csv \
       --models rf3 \
@@ -62,7 +62,7 @@ EXAMPLES:
       --rf3-checkpoint /data/checkpoints/rf3_foundry_01_24_latest.ckpt
 
     # Run grid search with Boltz1 model
-    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
+    docker run --gpus all --shm-size=16g -v /data:/data pixi-with-checkpoints \
       -e boltz run_grid_search.py \
       --proteins /data/proteins.csv \
       --models boltz1 \
@@ -74,7 +74,7 @@ EXAMPLES:
       --boltz1-checkpoint /data/checkpoints/boltz1_conf.ckpt
 
     # Run grid search with Boltz2 model
-    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
+    docker run --gpus all --shm-size=16g -v /data:/data pixi-with-checkpoints \
       -e boltz run_grid_search.py \
       --proteins /data/proteins.csv \
       --models boltz2 \
@@ -87,7 +87,7 @@ EXAMPLES:
       --boltz2-checkpoint /data/checkpoints/boltz2_conf.ckpt
 
     # Run grid search with Protenix model
-    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
+    docker run --gpus all --shm-size=16g -v /data:/data pixi-with-checkpoints \
       -e protenix run_grid_search.py \
       --proteins /data/proteins.csv \
       --models protenix \
@@ -99,10 +99,10 @@ EXAMPLES:
       --protenix-checkpoint /data/checkpoints/protenix_base_default_v0.5.0.pt
 
     # Interactive shell
-    docker run --gpus all --shm-size=16g -it sampleworks bash
+    docker run --gpus all --shm-size=16g -it pixi-with-checkpoints bash
 
     # Run a custom script
-    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
+    docker run --gpus all --shm-size=16g -v /data:/data pixi-with-checkpoints \
       -e boltz scripts/boltz2_pure_guidance.py \
       --structure /data/structure.cif \
       --density /data/density.ccp4 \
@@ -191,7 +191,7 @@ PROTEINS CSV FORMAT:
       2xyz,/data/structures/2xyz.cif,/data/maps/2xyz.mrc,1.8
 
 For full argument details, run:
-    docker run sampleworks -e boltz run_grid_search.py --help
+    docker run pixi-with-checkpoints -e boltz run_grid_search.py --help
 EOF
 }
 
@@ -201,7 +201,11 @@ if [ $# -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
     exit 0
 fi
 
-# Handle interactive shell
+# Handle scientist workflow helpers and interactive shells
+if [ "$1" = "run_experiments" ] || [ "$1" = "run_experiments.sh" ] || [ "$1" = "run_all_models.sh" ]; then
+    exec "$@"
+fi
+
 if [ "$1" = "bash" ] || [ "$1" = "sh" ]; then
     exec "$@"
 fi
@@ -222,9 +226,9 @@ while [[ $# -gt 0 ]]; do
         *)
             echo "Error: First argument must be -e <environment>, bash, or --help"
             echo ""
-            echo "Usage: docker run sampleworks -e <env> <script> [args...]"
-            echo "       docker run sampleworks bash"
-            echo "       docker run sampleworks --help"
+            echo "Usage: docker run pixi-with-checkpoints -e <env> <script> [args...]"
+            echo "       docker run pixi-with-checkpoints bash"
+            echo "       docker run pixi-with-checkpoints --help"
             exit 1
             ;;
     esac
@@ -234,12 +238,12 @@ done
 if [[ -z "$ENV" ]]; then
     echo "Error: Environment not specified. Use -e <env> where env is boltz, protenix, or rf3"
     echo ""
-    echo "Usage: docker run sampleworks -e <env> <script> [args...]"
+    echo "Usage: docker run pixi-with-checkpoints -e <env> <script> [args...]"
     echo ""
     echo "Examples:"
-    echo "  docker run sampleworks -e boltz run_grid_search.py --proteins /data/proteins.csv"
-    echo "  docker run sampleworks -e rf3 run_grid_search.py --help"
-    echo "  docker run sampleworks bash"
+    echo "  docker run pixi-with-checkpoints -e boltz run_grid_search.py --proteins /data/proteins.csv"
+    echo "  docker run pixi-with-checkpoints -e rf3 run_grid_search.py --help"
+    echo "  docker run pixi-with-checkpoints bash"
     exit 1
 fi
 
@@ -255,7 +259,7 @@ esac
 # Get the script to run
 if [[ $# -eq 0 ]]; then
     echo "Error: No script specified"
-    echo "Usage: docker run sampleworks -e <env> <script> [args...]"
+    echo "Usage: docker run pixi-with-checkpoints -e <env> <script> [args...]"
     exit 1
 fi
 
diff --git a/run_experiments b/run_experiments
index 5ca3e0f4..bed86c6a 100755
--- a/run_experiments
+++ b/run_experiments
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
 # ACTL-native entry point for Sampleworks preset runs.
 #
-# The TOML preset is the source of truth. This wrapper only supplies smooth
-# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH,
-# and direct use of the prebuilt pixi environments from the image at /app.
+# The TOML preset is the source of truth. This wrapper uses the synced ACTL
+# checkout at /home/dev/workspace for Sampleworks code, while reusing the
+# prebuilt pixi environments/checkpoints from the image at /app.
 
 set -euo pipefail
 
@@ -18,7 +18,56 @@ while [[ -L "$script_path" ]]; do
     fi
 done
 script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
-repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}"
+
+is_sampleworks_root() {
+    local candidate="$1"
+    [[ -f "$candidate/pyproject.toml" && -d "$candidate/src/sampleworks" && -f "$candidate/run_grid_search.py" ]]
+}
+
+find_sampleworks_root_upwards() {
+    local candidate="$1"
+    while [[ -n "$candidate" && "$candidate" != "/" ]]; do
+        if is_sampleworks_root "$candidate"; then
+            printf '%s\n' "$candidate"
+            return 0
+        fi
+        candidate="$(dirname -- "$candidate")"
+    done
+    return 1
+}
+
+resolve_repo_root() {
+    local override="${SAMPLEWORKS_SOURCE_DIR:-${SAMPLEWORKS_APP_DIR:-}}"
+    if [[ -n "$override" ]]; then
+        printf '%s\n' "$override"
+        return 0
+    fi
+
+    # ACTL syncs the user's local checkout here. Prefer it over any stale code
+    # that may have been used only to build the image's pixi environments.
+    if is_sampleworks_root "/home/dev/workspace"; then
+        printf '%s\n' "/home/dev/workspace"
+        return 0
+    fi
+
+    find_sampleworks_root_upwards "$PWD" && return 0
+
+    if is_sampleworks_root "$script_dir"; then
+        printf '%s\n' "$script_dir"
+        return 0
+    fi
+
+    cat >&2 <<'EOF'
+Could not find the synced Sampleworks checkout.
+
+Expected ACTL to sync the repo to /home/dev/workspace. If you are using a
+custom layout, set SAMPLEWORKS_SOURCE_DIR=/path/to/sampleworks before running
+run_experiments.
+EOF
+    return 2
+}
+
+repo_root="$(resolve_repo_root)"
 
 preset="${SAMPLEWORKS_PRESET:-all_models}"
 if [[ $# -gt 0 && "$1" != -* ]]; then
@@ -102,7 +151,7 @@ Sampleworks input dataset was not found.
 
 Expected: $source_proteins_csv
 
-On an ACTL sampleworks pod, make sure the diffuse-shared PVC is mounted at
+On an ACTL Sampleworks pod, make sure the diffuse-shared PVC is mounted at
 /mnt/diffuse-shared, or override the dataset path, for example:
 
   DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/<dataset> ./run_experiments
diff --git a/run_grid_search.py b/run_grid_search.py
index 189a04c4..cfef17cf 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -69,7 +69,28 @@ def get_job_status(job: JobConfig) -> str:
 def detect_gpus() -> list[str]:
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
     if cuda_visible:
-        return [g.strip() for g in cuda_visible.split(",") if g.strip()]
+        gpus = [g.strip() for g in cuda_visible.split(",") if g.strip()]
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
+                capture_output=True,
+                text=True,
+            )
+            if result.returncode == 0:
+                visible = [
+                    g.strip() for g in result.stdout.strip().split("\n") if g.strip()
+                ]
+                if all(g.isdigit() for g in gpus + visible):
+                    missing = sorted(set(gpus).difference(visible), key=int)
+                    if missing:
+                        raise ValueError(
+                            "CUDA_VISIBLE_DEVICES references GPUs that are not visible "
+                            f"in this container: {missing}. Visible GPUs: {visible}. "
+                            "Check the preset jobs.*.gpus values for this pod size."
+                        )
+        except FileNotFoundError:
+            pass
+        return gpus
     try:
         result = subprocess.run(
             ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 72ca9bbf..0e7d3f57 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -16,6 +16,7 @@
 
 
 DEFAULT_GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
+WORKSPACE_GRID_SEARCH_SCRIPT = "/home/dev/workspace/run_grid_search.py"
 
 
 @dataclass(frozen=True)
@@ -77,6 +78,128 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
     return invocations
 
 
+def _split_gpu_list(value: str) -> list[str]:
+    """Split a comma-separated GPU assignment into normalized tokens.
+
+    Parameters
+    ----------
+    value : str
+        GPU assignment string such as ``"0,1"``.
+
+    Returns
+    -------
+    list of str
+        Non-empty stripped GPU tokens.
+    """
+    return [part.strip() for part in value.split(",") if part.strip()]
+
+
+def _all_integer_tokens(values: list[str]) -> bool:
+    """Return True when every GPU token is a CUDA ordinal.
+
+    Parameters
+    ----------
+    values : list of str
+        GPU tokens to inspect.
+
+    Returns
+    -------
+    bool
+        True if all tokens are non-negative integer strings.
+    """
+    return all(value.isdigit() for value in values)
+
+
+def _detect_available_gpus() -> list[str]:
+    """Return GPU ordinals visible to the runner process.
+
+    Returns
+    -------
+    list of str
+        Visible GPU identifiers, or an empty list when GPU discovery is not
+        available. Empty means validation should be skipped.
+    """
+    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
+    if cuda_visible and cuda_visible.lower() not in {
+        "all",
+        "none",
+        "void",
+        "nodevfiles",
+    }:
+        return _split_gpu_list(cuda_visible)
+
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except FileNotFoundError:
+        return []
+    if result.returncode != 0:
+        return []
+    return [line.strip() for line in result.stdout.splitlines() if line.strip()]
+
+
+def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
+    """Fail fast when a preset asks for GPUs not present in this pod.
+
+    Parameters
+    ----------
+    invocations : list of JobInvocation
+        Jobs whose ``CUDA_VISIBLE_DEVICES`` assignments should be checked.
+
+    Raises
+    ------
+    RuntimeError
+        If numeric preset assignments reference unavailable visible GPU IDs,
+        or if multiple jobs claim the same GPU without opting into
+        oversubscription.
+    """
+    available = _detect_available_gpus()
+    if not available:
+        return
+
+    requested: dict[str, list[str]] = {}
+    for inv in invocations:
+        for gpu in _split_gpu_list(inv.job.gpus):
+            requested.setdefault(gpu, []).append(inv.job.name)
+
+    # Only all-numeric IDs are validated (and sorted as ints below); skip otherwise.
+    if not _all_integer_tokens(available) or not _all_integer_tokens(list(requested)):
+        return
+
+    available_set = set(available)
+    unavailable = {
+        gpu: names for gpu, names in requested.items() if gpu not in available_set
+    }
+    if unavailable:
+        details = ", ".join(
+            f"GPU {gpu} requested by {', '.join(names)}"
+            for gpu, names in sorted(unavailable.items(), key=lambda kv: int(kv[0]))
+        )
+        raise RuntimeError(
+            "Preset requests GPUs that are not visible in this pod. "
+            f"Visible GPUs: {', '.join(available)}. {details}. "
+            "Edit the preset's jobs.*.gpus values or run a smaller --only subset."
+        )
+
+    allow_oversubscription = os.environ.get(
+        "SAMPLEWORKS_ALLOW_GPU_OVERSUBSCRIPTION", ""
+    ).lower() in {"1", "true", "yes"}
+    duplicates = {gpu: names for gpu, names in requested.items() if len(names) > 1}
+    if duplicates and not allow_oversubscription:
+        details = ", ".join(
+            f"GPU {gpu} requested by {', '.join(names)}"
+            for gpu, names in sorted(duplicates.items(), key=lambda kv: int(kv[0]))
+        )
+        raise RuntimeError(
+            "Preset assigns the same GPU to multiple jobs. "
+            f"{details}. Set SAMPLEWORKS_ALLOW_GPU_OVERSUBSCRIPTION=1 to allow this."
+        )
+
+
 def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
     """Assemble the ``pixi run`` argv list for one job's args dict.
 
@@ -115,7 +238,7 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
 def _pixi_env_python(pixi_env: str) -> str | None:
     """Return the direct Python binary for a baked pixi environment when available.
 
-    The sampleworks ACTL image already contains fully-installed environments at
+    The ACTL pixi/checkpoint image already contains fully-installed environments at
     ``/app/.pixi/envs/<env>``. Calling those Python binaries directly avoids
     ``pixi run`` trying to refresh Git/PyPI caches on shared pod storage.
 
@@ -179,8 +302,8 @@ def _pixi_project_dir() -> Path:
     Returns
     -------
     Path
-        Project directory, defaulting to ``/app`` for the sampleworks image or
-        the current working directory outside that image.
+        Project directory, defaulting to ``/app`` for the ACTL pixi/checkpoint
+        image or the current working directory outside that image.
     """
     override = os.environ.get("SAMPLEWORKS_PIXI_PROJECT_DIR")
     if override:
@@ -194,17 +317,22 @@ def _pixi_project_dir() -> Path:
 def _grid_search_script() -> str:
     """Return the ``run_grid_search.py`` path used by worker jobs.
 
-    Resolution is intentionally simple for the ACTL sampleworks image: the
-    baked image keeps a stable copy at :data:`DEFAULT_GRID_SEARCH_SCRIPT`, while
-    synced PR worktrees can point the runner at their checkout with
-    ``SAMPLEWORKS_GRID_SEARCH_SCRIPT=/home/dev/workspace/run_grid_search.py``.
+    A non-empty ``SAMPLEWORKS_GRID_SEARCH_SCRIPT`` takes precedence. Otherwise
+    prefer the synced ACTL checkout when it exists, falling back to the
+    historical baked ``/app`` path.
 
     Returns
     -------
     str
         Path to execute with ``python`` inside each pixi environment.
     """
-    return os.environ.get("SAMPLEWORKS_GRID_SEARCH_SCRIPT", DEFAULT_GRID_SEARCH_SCRIPT)
+    override = os.environ.get("SAMPLEWORKS_GRID_SEARCH_SCRIPT")
+    if override:
+        return override
+    workspace_script = Path(WORKSPACE_GRID_SEARCH_SCRIPT)
+    if workspace_script.exists():
+        return str(workspace_script)
+    return DEFAULT_GRID_SEARCH_SCRIPT
 
 
 def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
@@ -230,6 +358,7 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
     """
     results_dir.mkdir(parents=True, exist_ok=True)
     invocations = build_invocations(preset, results_dir=results_dir)
+    _validate_gpu_assignments(invocations)
 
     if dry_run:
         for inv in invocations:
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 040ea0c1..140acf2d 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -116,6 +116,43 @@ def test_grid_search_script_can_be_overridden(monkeypatch: pytest.MonkeyPatch) -
     ]
 
 
+def test_gpu_validation_rejects_unavailable_gpu_ids(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """A preset for 8 GPUs should fail clearly on a smaller pod."""
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: ["0", "1", "2", "3"])
+    custom = tmp_path / "custom.toml"
+    custom.write_text(
+        "[shared_args]\n"
+        '[[jobs]]\nname = "ok"\nenv = "rf3"\ngpus = "0,1"\noutput_subdir = "ok"\n'
+        '[[jobs]]\nname = "bad"\nenv = "rf3"\ngpus = "4,5"\noutput_subdir = "bad"\n'
+    )
+    preset = loader.load_preset(str(custom))
+    invocations = runner.build_invocations(preset, results_dir=tmp_path / "results")
+
+    with pytest.raises(RuntimeError, match="not visible"):
+        runner._validate_gpu_assignments(invocations)
+
+
+def test_gpu_validation_rejects_duplicate_gpu_ids(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Accidental GPU oversubscription is caught before jobs launch."""
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: ["0", "1"])
+    monkeypatch.delenv("SAMPLEWORKS_ALLOW_GPU_OVERSUBSCRIPTION", raising=False)
+    custom = tmp_path / "custom.toml"
+    custom.write_text(
+        "[shared_args]\n"
+        '[[jobs]]\nname = "a"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "a"\n'
+        '[[jobs]]\nname = "b"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "b"\n'
+    )
+    preset = loader.load_preset(str(custom))
+    invocations = runner.build_invocations(preset, results_dir=tmp_path / "results")
+
+    with pytest.raises(RuntimeError, match="same GPU"):
+        runner._validate_gpu_assignments(invocations)
+
+
 def test_uses_baked_env_python_when_available(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:

From dbcd4b71e1f890176dab929a06a33d30dcd5f51e Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 11:20:19 -0400
Subject: [PATCH 12/28] fix(runs): ignore legacy app source default

---
 Dockerfile      |  3 ++-
 run_experiments | 17 ++++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 04e29ad9..b7d65b8e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -138,7 +138,8 @@ RUN rm -rf /app/src /app/scripts /app/run_grid_search.py \
 COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/
 RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n    cd /home/dev/workspace\nfi\n' >> /root/.bashrc
 
-ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app
+ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app \
+    SAMPLEWORKS_APP_DIR=
 
 # Set default checkpoint paths via environment variables
 ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \
diff --git a/run_experiments b/run_experiments
index bed86c6a..2d1d81ec 100755
--- a/run_experiments
+++ b/run_experiments
@@ -37,9 +37,9 @@ find_sampleworks_root_upwards() {
 }
 
 resolve_repo_root() {
-    local override="${SAMPLEWORKS_SOURCE_DIR:-${SAMPLEWORKS_APP_DIR:-}}"
-    if [[ -n "$override" ]]; then
-        printf '%s\n' "$override"
+    local source_override="${SAMPLEWORKS_SOURCE_DIR:-}"
+    if [[ -n "$source_override" ]]; then
+        printf '%s\n' "$source_override"
         return 0
     fi
 
@@ -52,6 +52,17 @@ resolve_repo_root() {
 
     find_sampleworks_root_upwards "$PWD" && return 0
 
+    # Legacy fallback for older images/users. This intentionally comes after
+    # /home/dev/workspace so inherited SAMPLEWORKS_APP_DIR=/app cannot mask the
+    # synced checkout.
+    local app_override="${SAMPLEWORKS_APP_DIR:-}"
+    if [[ -n "$app_override" ]]; then
+        if is_sampleworks_root "$app_override"; then
+            printf '%s\n' "$app_override"
+            return 0
+        fi
+    fi
+
     if is_sampleworks_root "$script_dir"; then
         printf '%s\n' "$script_dir"
         return 0

From 21f51bdfe62805e30e61a72d8d9ca74a861ddecb Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 11:26:03 -0400
Subject: [PATCH 13/28] docs(runs): use pixi-with-checkpoints latest image

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 07a95646..10d2ee06 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,7 @@ Use ACTL to get a ready-to-run pod with baked pixi environments, checkpoints,
 and the shared data PVC:
 
 ```bash
-actl pod up sampleworks-pr236 --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:cuda12.4-2026-05-21-pr240-workspace1 --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+actl pod up sampleworks-pr236 --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
 ```
 
 ACTL syncs your local checkout to `/home/dev/workspace`; interactive shells land

From 0383692774d3fa6ec89cfe4428556e7debd9f054 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 11:31:19 -0400
Subject: [PATCH 14/28] fix(runs): refuse runtime pixi reinstalls

---
 Dockerfile                     |  3 ++-
 run_experiments                | 12 ++++++++++++
 src/sampleworks/runs/runner.py |  3 +++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index b7d65b8e..f1202b90 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -139,7 +139,8 @@ COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local
 RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n    cd /home/dev/workspace\nfi\n' >> /root/.bashrc
 
 ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app \
-    SAMPLEWORKS_APP_DIR=
+    SAMPLEWORKS_APP_DIR= \
+    SAMPLEWORKS_REQUIRE_PREBUILT_PIXI=1
 
 # Set default checkpoint paths via environment variables
 ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \
diff --git a/run_experiments b/run_experiments
index 2d1d81ec..996ee7ee 100755
--- a/run_experiments
+++ b/run_experiments
@@ -199,6 +199,18 @@ if [[ -x "$runner_python" ]]; then
         "$@"
 fi
 
+if [[ "${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-}" =~ ^(1|true|yes)$ ]]; then
+    cat >&2 <<EOF
+Prebuilt pixi environment is missing: $runner_python
+
+This image is supposed to contain ready-to-use pixi environments under /app/.pixi.
+Refusing to run 'pixi run' because that would reinstall CUDA packages inside the pod.
+Use a rebuilt pixi-with-checkpoints image or unset SAMPLEWORKS_REQUIRE_PREBUILT_PIXI
+if you intentionally want runtime pixi installation.
+EOF
+    exit 2
+fi
+
 exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
     "$preset" \
     --results-dir "$RESULTS_DIR" \
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 0e7d3f57..46b626e4 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -415,6 +415,9 @@ def _prepare_pixi_env(pixi_env: str) -> None:
     subprocess.CalledProcessError
         If pixi cannot prepare the environment.
     """
+    if _pixi_env_python(pixi_env) is not None:
+        return
+
     if os.environ.get("SAMPLEWORKS_SKIP_ENV_PREPARE", "").lower() in {
         "1",
         "true",

From a659786750cb520215c5777caa951ef70d4ba7f1 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 15:04:53 -0400
Subject: [PATCH 15/28] fix(runs): skip pixi prepare in baked runtime

---
 run_experiments | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/run_experiments b/run_experiments
index 996ee7ee..07c0335c 100755
--- a/run_experiments
+++ b/run_experiments
@@ -106,6 +106,11 @@ export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_r
 export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
 export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}"
 export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
+# The ACTL image is expected to provide ready-to-use pixi envs under /app/.pixi.
+# Do not let sampleworks.runs.runner call `pixi run` just to "prepare" envs;
+# that can reinstall the CUDA stack inside the pod, especially when the synced
+# workspace source is older than this wrapper.
+export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-1}"
 
 shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
 for checkpoint_var_and_file in \

From 535c66566162b4a693055e34b0777cb55cbb9c78 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Fri, 22 May 2026 15:17:59 -0400
Subject: [PATCH 16/28] fix(runs): require baked pixi envs in ACTL

---
 README.md                      | 23 +++++++++----
 run_experiments                | 15 +++++----
 src/sampleworks/runs/cli.py    |  6 +++-
 src/sampleworks/runs/runner.py | 59 ++++++++++++++++++++++++++++++++++
 tests/runs/test_runner.py      | 14 ++++++++
 5 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 10d2ee06..082e7304 100644
--- a/README.md
+++ b/README.md
@@ -154,22 +154,33 @@ Instructions for running evaluation and metrics scripts are coming soon.
 
 ## ACTL preset experiments (`run_experiments`)
 
-Use ACTL to get a ready-to-run pod with baked pixi environments, checkpoints,
-and the shared data PVC:
+Use ACTL to get one 8-GPU pod with baked pixi environments, checkpoints, the
+shared data PVC, and your local checkout synced to `/home/dev/workspace`:
 
 ```bash
-actl pod up sampleworks-pr236 --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+actl pod up sampleworks-pr240 --fresh --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
 ```
 
-ACTL syncs your local checkout to `/home/dev/workspace`; interactive shells land
-there. Run experiments from that synced checkout, not from `/app`:
+Keep that `actl pod up` terminal open so sync/SSH stays alive. In another
+terminal, copy the `ssh:` line from `actl pod status sampleworks-pr240`, then:
 
 ```bash
+ssh workspace.actl-ws-<user>-sampleworks-pr240.devspace
+cd /home/dev/workspace
 run_experiments --dry-run
 run_experiments all_models
 ```
 
-`run_experiments` is a thin wrapper around `sampleworks-runs`: it reads TOML presets and launches the requested `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job. The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2 MD, RF3, and Protenix.
+`run_experiments` is the entrypoint. It uses the synced source tree from
+`/home/dev/workspace`, but runs the baked interpreters from
+`/app/.pixi/envs/{boltz,protenix,rf3}/bin/python` directly. It should not run
+`pixi install` or pull packages at runtime; if an env is missing, recreate the
+pod with the current `pixi-with-checkpoints` image.
+
+`run_experiments` reads TOML presets and launches the requested
+`run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job.
+The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2
+MD, RF3, and Protenix.
 
 Presets live in the synced repo at `src/sampleworks/runs/presets/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
 
diff --git a/run_experiments b/run_experiments
index 07c0335c..72850c88 100755
--- a/run_experiments
+++ b/run_experiments
@@ -110,6 +110,7 @@ export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
 # Do not let sampleworks.runs.runner call `pixi run` just to "prepare" envs;
 # that can reinstall the CUDA stack inside the pod, especially when the synced
 # workspace source is older than this wrapper.
+export SAMPLEWORKS_REQUIRE_PREBUILT_PIXI="${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-1}"
 export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-1}"
 
 shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
@@ -204,14 +205,16 @@ if [[ -x "$runner_python" ]]; then
         "$@"
 fi
 
-if [[ "${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-}" =~ ^(1|true|yes)$ ]]; then
+if [[ ! "${SAMPLEWORKS_ALLOW_RUNTIME_PIXI:-}" =~ ^(1|true|yes)$ ]]; then
     cat >&2 <<EOF
-Prebuilt pixi environment is missing: $runner_python
+Prebuilt runner pixi environment is missing: $runner_python
 
-This image is supposed to contain ready-to-use pixi environments under /app/.pixi.
-Refusing to run 'pixi run' because that would reinstall CUDA packages inside the pod.
-Use a rebuilt pixi-with-checkpoints image or unset SAMPLEWORKS_REQUIRE_PREBUILT_PIXI
-if you intentionally want runtime pixi installation.
+run_experiments is for the ACTL pixi-with-checkpoints image, which must contain
+ready-to-use environments under /app/.pixi. Refusing to run 'pixi run' because
+that would install or refresh packages inside the pod.
+
+Recreate the pod with the current pixi-with-checkpoints image. If you are
+intentionally debugging runtime pixi setup, set SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1.
 EOF
     exit 2
 fi
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 7cb64f93..8201872e 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -46,7 +46,11 @@ def main(argv: list[str] | None = None) -> int:
         return 0
 
     results_dir = Path(args.results_dir or _default_results_dir(preset))
-    return runner.run(preset, results_dir=results_dir, dry_run=args.dry_run)
+    try:
+        return runner.run(preset, results_dir=results_dir, dry_run=args.dry_run)
+    except RuntimeError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
 
 
 def _build_parser() -> argparse.ArgumentParser:
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 46b626e4..12570a0c 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -221,6 +221,8 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]:
     env_python = _pixi_env_python(pixi_env)
     if env_python:
         argv = [env_python, _grid_search_script()]
+    elif _require_prebuilt_envs():
+        raise RuntimeError(_missing_prebuilt_env_message(pixi_env))
     else:
         argv = ["pixi", "run", "-e", pixi_env, "python", _grid_search_script()]
     for key, value in args.items():
@@ -266,6 +268,60 @@ def _pixi_env_python(pixi_env: str) -> str | None:
     return None
 
 
+def _truthy_env(name: str) -> bool:
+    """Return True when an environment variable is set to a truthy value.
+
+    Parameters
+    ----------
+    name : str
+        Environment variable name to inspect.
+
+    Returns
+    -------
+    bool
+        True for ``1``, ``true``, or ``yes`` values, case-insensitive.
+    """
+    return os.environ.get(name, "").lower() in {"1", "true", "yes"}
+
+
+def _require_prebuilt_envs() -> bool:
+    """Return True when runtime pixi fallback must be disabled.
+
+    Returns
+    -------
+    bool
+        True when ``SAMPLEWORKS_REQUIRE_PREBUILT_PIXI`` is truthy (set by the
+        ACTL wrapper/image) and ``SAMPLEWORKS_ALLOW_RUNTIME_PIXI`` is not.
+    """
+    return _truthy_env("SAMPLEWORKS_REQUIRE_PREBUILT_PIXI") and not _truthy_env(
+        "SAMPLEWORKS_ALLOW_RUNTIME_PIXI"
+    )
+
+
+def _missing_prebuilt_env_message(pixi_env: str) -> str:
+    """Build the error message for a missing baked pixi environment.
+
+    Parameters
+    ----------
+    pixi_env : str
+        Required pixi environment name.
+
+    Returns
+    -------
+    str
+        Human-readable error message explaining how to fix the pod/image.
+    """
+    expected = _pixi_project_dir() / ".pixi" / "envs" / pixi_env / "bin" / "python"
+    return (
+        f"Prebuilt pixi environment is missing for job env {pixi_env!r}: {expected}. "
+        "The pixi-with-checkpoints image must contain ready-to-use boltz, "
+        "protenix, and rf3 environments. Refusing to fall back to 'pixi run' "
+        "because that would install or refresh packages inside the pod. "
+        "Recreate the pod with the current image, or set "
+        "SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 only when intentionally debugging pixi."
+    )
+
+
 def _job_env(pixi_env: str, env: dict[str, str]) -> dict[str, str]:
     """Return an environment equivalent to activating a direct pixi env.
 
@@ -418,6 +474,9 @@ def _prepare_pixi_env(pixi_env: str) -> None:
     if _pixi_env_python(pixi_env) is not None:
         return
 
+    if _require_prebuilt_envs():
+        raise RuntimeError(_missing_prebuilt_env_message(pixi_env))
+
     if os.environ.get("SAMPLEWORKS_SKIP_ENV_PREPARE", "").lower() in {
         "1",
         "true",
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 140acf2d..be4b9cef 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -171,6 +171,20 @@ def test_uses_baked_env_python_when_available(
     assert inv.argv[:2] == [str(python_bin), "/app/run_grid_search.py"]
 
 
+def test_prebuilt_env_required_rejects_runtime_pixi(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """ACTL runs fail clearly instead of installing missing pixi envs at runtime."""
+    monkeypatch.delenv("SAMPLEWORKS_FORCE_PIXI", raising=False)
+    monkeypatch.setenv("SAMPLEWORKS_REQUIRE_PREBUILT_PIXI", "1")
+    monkeypatch.delenv("SAMPLEWORKS_ALLOW_RUNTIME_PIXI", raising=False)
+    monkeypatch.setenv("SAMPLEWORKS_PIXI_PROJECT_DIR", str(tmp_path / "app"))
+
+    preset = loader.load_preset("rf3_partial")
+    with pytest.raises(RuntimeError, match="Refusing to fall back to 'pixi run'"):
+        runner.build_invocations(preset, results_dir=Path("/r"))
+
+
 def test_dry_run_does_not_create_directories(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:

From e51e368e7bc5ae239866952114904dda0f4bd398 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 12:30:59 -0400
Subject: [PATCH 17/28] feat(runs): simplify experiment target syntax

---
 README.md                                     |  14 ++-
 run_experiments                               | 118 +++++++++++++++---
 src/sampleworks/runs/cli.py                   | 100 ++++++++++++---
 .../{all_models.toml => full_8gpu.toml}       |   4 +-
 src/sampleworks/runs/runner.py                |   5 +-
 src/sampleworks/runs/schema.py                |   6 +-
 tests/runs/test_cli.py                        |  46 +++++--
 tests/runs/test_loader.py                     |  10 +-
 tests/runs/test_runner.py                     |   4 +-
 9 files changed, 240 insertions(+), 67 deletions(-)
 rename src/sampleworks/runs/presets/{all_models.toml => full_8gpu.toml} (86%)

diff --git a/README.md b/README.md
index 082e7304..c5440ec3 100644
--- a/README.md
+++ b/README.md
@@ -168,7 +168,8 @@ terminal, copy the `ssh:` line from `actl pod status sampleworks-pr240`, then:
 ssh workspace.actl-ws-<user>-sampleworks-pr240.devspace
 cd /home/dev/workspace
 run_experiments --dry-run
-run_experiments all_models
+run_experiments
+run_experiments rf3
 ```
 
 `run_experiments` is the entrypoint. It uses the synced source tree from
@@ -179,20 +180,21 @@ pod with the current `pixi-with-checkpoints` image.
 
 `run_experiments` reads TOML presets and launches the requested
 `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job.
-The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2
-MD, RF3, and Protenix.
+The default preset is `full_8gpu`, which splits GPUs across Boltz2 XRD, Boltz2
+MD, RF3, and Protenix. A positional target like `rf3` or `rf3,protenix` runs
+those jobs from `full_8gpu`; use `--preset rf3_partial` for a specific preset.
 
 Presets live in the synced repo at `src/sampleworks/runs/presets/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
 
 ```bash
-run_experiments all_models --only rf3,protenix
-run_experiments rf3_partial --set jobs.rf3.gpus=0
+run_experiments rf3,protenix
+run_experiments --preset rf3_partial --set jobs.rf3.gpus=0
 ```
 
 On smaller pods, make sure preset GPU IDs only reference visible pod GPUs
 (`0..N-1`). `run_experiments` fails fast if a preset requests unavailable GPUs.
 
-The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<pod>/<preset>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations. `run_all_models.sh` remains as a compatibility alias.
+The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations.
 
 
 ## Docker
diff --git a/run_experiments b/run_experiments
index 72850c88..88adf215 100755
--- a/run_experiments
+++ b/run_experiments
@@ -80,27 +80,92 @@ EOF
 
 repo_root="$(resolve_repo_root)"
 
-preset="${SAMPLEWORKS_PRESET:-all_models}"
-if [[ $# -gt 0 && "$1" != -* ]]; then
-    preset="$1"
-    shift
-fi
+env_preset="${SAMPLEWORKS_PRESET:-}"
+default_target="${env_preset:-full_8gpu}"
+target=""
+explicit_preset=""
+explicit_jobs=""
+explicit_results_dir=""
+expect_value_for=""
+for arg in "$@"; do
+    if [[ -n "$expect_value_for" ]]; then
+        case "$expect_value_for" in
+            preset)
+                explicit_preset="$arg"
+                ;;
+            results-dir)
+                explicit_results_dir="$arg"
+                ;;
+            jobs)
+                explicit_jobs="$arg"
+                ;;
+        esac
+        expect_value_for=""
+        continue
+    fi
 
-if [[ "$preset" == *.toml || "$preset" == */* ]]; then
-    if [[ "$preset" != /* ]]; then
-        preset="$repo_root/$preset"
+    case "$arg" in
+        --preset)
+            expect_value_for="preset"
+            ;;
+        --preset=*)
+            explicit_preset="${arg#--preset=}"
+            ;;
+        --results-dir)
+            expect_value_for="results-dir"
+            ;;
+        --results-dir=*)
+            explicit_results_dir="${arg#--results-dir=}"
+            ;;
+        --jobs)
+            expect_value_for="jobs"
+            ;;
+        --jobs=*)
+            explicit_jobs="${arg#--jobs=}"
+            ;;
+        -*)
+            ;;
+        *)
+            if [[ -z "$target" ]]; then
+                target="$arg"
+            fi
+            ;;
+    esac
+done
+
+label_source="$default_target"
+if [[ -n "$explicit_preset" ]]; then
+    label_source="$explicit_preset"
+elif [[ -n "$explicit_jobs" && ( -z "$target" || "$target" == "all" || "$target" == "full" || "$target" == "full_8gpu" ) ]]; then
+    label_source="$explicit_jobs"
+elif [[ -n "$target" ]]; then
+    label_source="$target"
+fi
+case "$label_source" in
+    all|full)
+        label_source="full_8gpu"
+        ;;
+esac
+if [[ "$label_source" == *.toml || "$label_source" == */* ]]; then
+    if [[ "$label_source" != /* ]]; then
+        label_source="$repo_root/$label_source"
     fi
 fi
-preset_label="${preset##*/}"
-preset_label="${preset_label%.toml}"
+run_label="${label_source##*/}"
+run_label="${run_label%.toml}"
+run_label="${run_label//,/_}"
 
 run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}"
 default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
-default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}"
+default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${run_label}"
 default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa"
 
 export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}"
-export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}"
+if [[ -n "$explicit_results_dir" ]]; then
+    export RESULTS_DIR="$explicit_results_dir"
+else
+    export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}"
+fi
 export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}"
 export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}"
 export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
@@ -152,6 +217,16 @@ if [[ -z "$pixi_project_dir" ]]; then
 fi
 runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}"
 
+extra_cli_args=()
+if [[ $# -eq 0 && -n "$env_preset" ]]; then
+    extra_cli_args=(--preset "$env_preset")
+fi
+
+display_target="${target:-${explicit_preset:-$default_target}}"
+if [[ -n "$explicit_jobs" ]]; then
+    display_target="$display_target --jobs $explicit_jobs"
+fi
+
 needs_runtime_paths=1
 for arg in "$@"; do
     case "$arg" in
@@ -181,7 +256,7 @@ fi
 
 cat >&2 <<EOF
 Sampleworks preset run
-  preset:        $preset
+  target:        $display_target
   data:          $DATA_DIR
   results:       $RESULTS_DIR
   msa cache:     $MSA_CACHE_DIR
@@ -192,15 +267,20 @@ Sampleworks preset run
 
 EOF
 
-cd "$pixi_project_dir"
 if [[ -x "$runner_python" ]]; then
     runner_env_dir="$(cd -- "$(dirname -- "$runner_python")/.." && pwd)"
     export PATH="$runner_env_dir/bin${PATH:+:$PATH}"
     export CONDA_PREFIX="$runner_env_dir"
     export CUDA_HOME="${CUDA_HOME:-$runner_env_dir}"
     export PYTHONNOUSERSITE=1
+    cd "$repo_root"
+    if [[ "${#extra_cli_args[@]}" -gt 0 ]]; then
+        exec "$runner_python" -m sampleworks.runs.cli \
+            --results-dir "$RESULTS_DIR" \
+            "${extra_cli_args[@]}" \
+            "$@"
+    fi
     exec "$runner_python" -m sampleworks.runs.cli \
-        "$preset" \
         --results-dir "$RESULTS_DIR" \
         "$@"
 fi
@@ -219,7 +299,13 @@ EOF
     exit 2
 fi
 
+cd "$pixi_project_dir"
+if [[ "${#extra_cli_args[@]}" -gt 0 ]]; then
+    exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
+        --results-dir "$RESULTS_DIR" \
+        "${extra_cli_args[@]}" \
+        "$@"
+fi
 exec pixi run -e "$runner_env" python -m sampleworks.runs.cli \
-    "$preset" \
     --results-dir "$RESULTS_DIR" \
     "$@"
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 8201872e..b669be7c 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -11,6 +11,10 @@
 from .schema import Preset
 
 
+DEFAULT_PRESET = "full_8gpu"
+DEFAULT_PRESET_ALIASES = frozenset({"all", "full", "full_8gpu"})
+
+
 def main(argv: list[str] | None = None) -> int:
     """Entry point for the ``sampleworks-runs`` console script.
 
@@ -34,12 +38,10 @@ def main(argv: list[str] | None = None) -> int:
             print(name)
         return 0
 
-    if args.preset is None:
-        parser.error("PRESET is required (or pass --list)")
-
-    preset = loader.load_preset(args.preset, overrides=args.set)
-    if args.only:
-        preset = _filter_only(preset, args.only)
+    preset_name, job_filter = _resolve_target(args.target, args.preset, args.jobs, parser)
+    preset = loader.load_preset(preset_name, overrides=args.set)
+    if job_filter:
+        preset = _filter_jobs(preset, job_filter)
 
     if args.show:
         _print_show(preset)
@@ -64,23 +66,35 @@ def _build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         prog="sampleworks-runs",
         description=(
-            "Run a preset of parallel run_grid_search.py jobs. "
-            "Presets are TOML files bundled under sampleworks.runs.presets, "
-            "or pass a path to a .toml file directly."
+            "Run Sampleworks experiment presets. With no target, runs the "
+            "full_8gpu preset. A target like 'rf3' or 'rf3,protenix' is a "
+            "job shortcut from full_8gpu; use --preset for another TOML preset."
+        ),
+    )
+    parser.add_argument(
+        "target",
+        nargs="?",
+        help=(
+            "Job shortcut from full_8gpu (rf3, protenix, boltz2_xrd, "
+            "boltz2_md, or comma-separated), or 'full'/'full_8gpu'."
         ),
     )
-    parser.add_argument("preset", nargs="?", help="Bundled preset name or path to a .toml file")
+    parser.add_argument(
+        "--preset",
+        default="",
+        help="Bundled preset name or path to a .toml file. Default: full_8gpu.",
+    )
     parser.add_argument("--list", action="store_true", help="List bundled presets and exit")
     parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit")
     parser.add_argument(
         "--dry-run",
         action="store_true",
-        help="Print the pixi run commands instead of executing them",
+        help="Print the resolved job commands instead of executing them",
     )
     parser.add_argument(
-        "--only",
+        "--jobs",
         default="",
-        help="Comma-separated job names to run (subset). Default: all jobs.",
+        help="Comma-separated job names to run from the selected preset. Default: all jobs.",
     )
     parser.add_argument(
         "--set",
@@ -102,14 +116,58 @@ def _build_parser() -> argparse.ArgumentParser:
     return parser
 
 
-def _filter_only(preset: Preset, only: str) -> Preset:
+def _resolve_target(
+    target: str | None,
+    preset: str,
+    jobs: str,
+    parser: argparse.ArgumentParser,
+) -> tuple[str, str]:
+    """Resolve the user-facing target grammar into preset plus job filter.
+
+    Parameters
+    ----------
+    target : str or None
+        Optional positional target. Without ``--preset`` this is either a
+        default preset alias (``full``/``full_8gpu``/``all``) or a job selector
+        from :data:`DEFAULT_PRESET`. With ``--preset`` it is a shorthand job
+        selector for that explicit preset.
+    preset : str
+        Explicit preset name/path from ``--preset``.
+    jobs : str
+        Explicit comma-separated job selector from ``--jobs``.
+    parser : argparse.ArgumentParser
+        Parser used to report grammar errors.
+
+    Returns
+    -------
+    tuple of str, str
+        ``(preset_name_or_path, comma_separated_job_filter)``.
+    """
+    if preset:
+        if target and jobs:
+            parser.error("pass jobs either as the positional target or with --jobs, not both")
+        return preset, jobs or target or ""
+
+    if target is None or target in DEFAULT_PRESET_ALIASES:
+        return DEFAULT_PRESET, jobs
+
+    if jobs:
+        parser.error("pass jobs either as the positional target or with --jobs, not both")
+
+    if target.endswith(".toml") or "/" in target:
+        parser.error("pass custom preset paths with --preset path/to/preset.toml")
+
+    return DEFAULT_PRESET, target
+
+
+def _filter_jobs(preset: Preset, jobs: str) -> Preset:
     """Return a new :class:`Preset` containing only the named jobs.
 
     Parameters
     ----------
     preset : Preset
         Source preset.
-    only : str
+    jobs : str
         Comma-separated list of job names to keep.
 
     Returns
@@ -121,16 +179,18 @@ def _filter_only(preset: Preset, only: str) -> Preset:
     Raises
     ------
     SystemExit
-        If any name in ``only`` does not match a job in ``preset``.
+        If any name in ``jobs`` does not match a job in ``preset``.
     """
-    names = [n.strip() for n in only.split(",") if n.strip()]
+    names = [n.strip() for n in jobs.split(",") if n.strip()]
     keep = [j for j in preset.jobs if j.name in names]
     missing = set(names) - {j.name for j in keep}
     if missing:
-        raise SystemExit(f"--only references unknown jobs: {sorted(missing)}")
+        raise SystemExit(f"job selector references unknown jobs: {sorted(missing)}")
+    description = f"Subset of {preset.name}: {', '.join(names)}"
+    name = f"{preset.name}:{','.join(names)}"
     return Preset(
-        name=preset.name,
-        description=preset.description,
+        name=name,
+        description=description,
         defaults=preset.defaults,
         shared_args=preset.shared_args,
         jobs=keep,
diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/full_8gpu.toml
similarity index 86%
rename from src/sampleworks/runs/presets/all_models.toml
rename to src/sampleworks/runs/presets/full_8gpu.toml
index a00461fb..fb700177 100644
--- a/src/sampleworks/runs/presets/all_models.toml
+++ b/src/sampleworks/runs/presets/full_8gpu.toml
@@ -1,8 +1,8 @@
-description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)."
+description = "Full 8-GPU panel: Boltz2 X-ray, Boltz2 MD, RF3, and Protenix."
 
 [defaults]
 DATA_DIR = "/data/inputs"
-RESULTS_DIR = "/data/results/all_models"
+RESULTS_DIR = "/data/results/full_8gpu"
 MSA_CACHE_DIR = "/root/.sampleworks"
 PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
 
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 12570a0c..832d93b7 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -28,7 +28,8 @@ class JobInvocation:
     job : Job
         Originating :class:`Job` (kept for introspection in logs).
     argv : list of str
-        Subprocess command line (starts with ``pixi run -e <env> python ...``).
+        Subprocess command line: the baked pixi env Python followed by
+        ``run_grid_search.py``, or ``pixi run -e <env> python ...`` as fallback.
     env : dict of str to str
         Process environment, including ``CUDA_VISIBLE_DEVICES``.
     log_path : Path
@@ -182,7 +183,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
         raise RuntimeError(
             "Preset requests GPUs that are not visible in this pod. "
             f"Visible GPUs: {', '.join(available)}. {details}. "
-            "Edit the preset's jobs.*.gpus values or run a smaller --only subset."
+            "Edit the preset's jobs.*.gpus values or run a smaller --jobs subset."
         )
 
     allow_oversubscription = os.environ.get(
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index b57a84e9..f21494f6 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -1,8 +1,8 @@
 """Dataclasses for the preset schema.
 
 A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
-is launched as ``pixi run -e <env> python <run_grid_search.py> <args>`` with
-``CUDA_VISIBLE_DEVICES`` set to the job's GPU assignment.
+is launched in its configured model environment with ``CUDA_VISIBLE_DEVICES``
+set to the job's GPU assignment.
 """
 
 from __future__ import annotations
@@ -21,7 +21,7 @@ class Job:
     Parameters
     ----------
     name : str
-        Identifier used for per-job log files and ``--only`` selection. Must be
+        Identifier used for per-job log files and ``--jobs`` selection. Must be
         unique within the parent :class:`Preset`.
     env : str
         Pixi environment to run the job in. Must be one of
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index 1023f0b3..adbe5432 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -1,4 +1,4 @@
-"""End-to-end CLI tests (--list, --show, --dry-run, --only)."""
+"""End-to-end CLI tests (--list, --show, --dry-run, job shortcuts)."""
 
 from __future__ import annotations
 
@@ -13,7 +13,7 @@ def test_list_prints_all_bundled_presets(capsys: pytest.CaptureFixture[str]) ->
     assert exit_code == 0
     out = capsys.readouterr().out.splitlines()
     assert set(out) == {
-        "all_models",
+        "full_8gpu",
         "rf3_partial",
         "rf3_partial_chiral_off",
         "protenix_dual",
@@ -25,7 +25,7 @@ def test_show_prints_resolved_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    exit_code = cli.main(["rf3_partial", "--show"])
+    exit_code = cli.main(["--preset", "rf3_partial", "--show"])
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "name: rf3_partial" in out
@@ -36,30 +36,49 @@ def test_dry_run_does_not_invoke_subprocess(
     monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str]
 ) -> None:
     monkeypatch.setenv("HOME", str(tmp_path))
-    exit_code = cli.main(["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)])
+    exit_code = cli.main([
+        "--preset",
+        "rf3_partial",
+        "--dry-run",
+        "--results-dir",
+        str(tmp_path),
+    ])
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "pixi run -e rf3 python /app/run_grid_search.py" in out
     assert "CUDA_VISIBLE_DEVICES=4" in out
 
 
-def test_only_filters_to_subset(
+def test_job_shortcut_filters_default_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    exit_code = cli.main(["all_models", "--only", "rf3,protenix", "--show"])
+    exit_code = cli.main(["rf3,protenix", "--show"])
     assert exit_code == 0
     out = capsys.readouterr().out
+    assert "name: full_8gpu:rf3,protenix" in out
     assert "name: rf3" in out
     assert "name: protenix" in out
     assert "boltz2_xrd" not in out
     assert "boltz2_md" not in out
 
 
-def test_only_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -> None:
+def test_jobs_filters_explicit_preset(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(["--preset", "full_8gpu", "--jobs", "rf3", "--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: full_8gpu:rf3" in out
+    assert "name: rf3" in out
+    assert "protenix" not in out
+
+
+def test_job_shortcut_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
     with pytest.raises(SystemExit, match="unknown jobs"):
-        cli.main(["all_models", "--only", "nonexistent", "--show"])
+        cli.main(["nonexistent", "--show"])
 
 
 def test_set_override_propagates_through_cli(
@@ -68,6 +87,7 @@ def test_set_override_propagates_through_cli(
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(
         [
+            "--preset",
             "rf3_partial",
             "--set",
             "jobs.rf3.args.gradient-weights=0.0 0.01",
@@ -79,7 +99,11 @@ def test_set_override_propagates_through_cli(
     assert "0.0 0.01" in out
 
 
-def test_no_preset_and_no_list_errors(monkeypatch: pytest.MonkeyPatch) -> None:
+def test_no_target_defaults_to_full_8gpu(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    with pytest.raises(SystemExit):
-        cli.main([])
+    exit_code = cli.main(["--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: full_8gpu" in out
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 7a2bb1b1..1e050572 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -8,7 +8,7 @@
 from sampleworks.runs import loader
 
 
-BUNDLED = ["all_models", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
+BUNDLED = ["full_8gpu", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
 
 
 def test_list_bundled_presets_returns_the_five() -> None:
@@ -43,11 +43,11 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
     assert preset.defaults["DATA_DIR"] == "/data/inputs"
 
 
-def test_all_models_uses_canonical_inputs_dir(monkeypatch: pytest.MonkeyPatch) -> None:
+def test_full_8gpu_uses_canonical_inputs_dir(monkeypatch: pytest.MonkeyPatch) -> None:
     """The flagship preset must use /data/inputs, matching the ACTL wrapper."""
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
-    preset = loader.load_preset("all_models")
+    preset = loader.load_preset("full_8gpu")
     assert preset.defaults["DATA_DIR"] == "/data/inputs"
     assert preset.shared_args["proteins"] == "/data/inputs/proteins.csv"
 
@@ -62,13 +62,13 @@ def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
 
 def test_set_override_at_job_by_name(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    preset = loader.load_preset("all_models", overrides=["jobs.rf3.gpus=7"])
+    preset = loader.load_preset("full_8gpu", overrides=["jobs.rf3.gpus=7"])
     assert preset.job("rf3").gpus == "7"
 
 
 def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    preset = loader.load_preset("all_models", overrides=["jobs.0.gpus=9"])
+    preset = loader.load_preset("full_8gpu", overrides=["jobs.0.gpus=9"])
     assert preset.jobs[0].gpus == "9"
 
 
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index be4b9cef..9305368d 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -64,11 +64,11 @@ def test_explicit_output_dir_in_args_wins_over_subdir_default(
     assert pairs["--output-dir"] == "/explicit/path"
 
 
-def test_all_models_has_four_jobs_with_distinct_gpus(
+def test_full_8gpu_has_four_jobs_with_distinct_gpus(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     monkeypatch.setenv("HOME", "/home/test")
-    preset = loader.load_preset("all_models")
+    preset = loader.load_preset("full_8gpu")
     invocations = runner.build_invocations(preset, results_dir=Path("/r"))
     assert [i.job.name for i in invocations] == ["boltz2_xrd", "boltz2_md", "rf3", "protenix"]
     gpu_assignments = [i.env["CUDA_VISIBLE_DEVICES"] for i in invocations]

From c60bea7fa7d588272774d92b9c712cee6f49c47d Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 13:17:56 -0400
Subject: [PATCH 18/28] feat(runs): move presets to experiments directory

---
 Dockerfile                                    |   3 +-
 README.md                                     |   2 +-
 .../presets => experiments}/full_8gpu.toml    |   0
 .../protenix_dual.toml                        |   0
 .../presets => experiments}/rf3_partial.toml  |   0
 .../rf3_partial_chiral_off.toml               |   0
 .../presets => experiments}/rf3_protenix.toml |   0
 pyproject.toml                                |   2 +-
 src/sampleworks/runs/cli.py                   |   6 +-
 src/sampleworks/runs/loader.py                | 132 ++++++++++++++----
 src/sampleworks/runs/schema.py                |   2 +-
 tests/runs/test_cli.py                        |   2 +-
 tests/runs/test_loader.py                     |  33 ++++-
 13 files changed, 146 insertions(+), 36 deletions(-)
 rename {src/sampleworks/runs/presets => experiments}/full_8gpu.toml (100%)
 rename {src/sampleworks/runs/presets => experiments}/protenix_dual.toml (100%)
 rename {src/sampleworks/runs/presets => experiments}/rf3_partial.toml (100%)
 rename {src/sampleworks/runs/presets => experiments}/rf3_partial_chiral_off.toml (100%)
 rename {src/sampleworks/runs/presets => experiments}/rf3_protenix.toml (100%)

diff --git a/Dockerfile b/Dockerfile
index f1202b90..b45067b9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -97,6 +97,7 @@ WORKDIR /app
 # Copy all project files - needed because sampleworks is installed as editable package
 # The pypi-dependencies section has: sampleworks = {editable = true, path = "."}
 COPY pyproject.toml pixi.lock ./
+COPY experiments/ ./experiments/
 COPY src/ ./src/
 COPY scripts/ ./scripts/
 COPY run_grid_search.py ./
@@ -132,7 +133,7 @@ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-comp
 # This image carries pixi environments and checkpoints. Runtime source should
 # come from ACTL's synced checkout at /home/dev/workspace, not from stale code
 # baked into /app during image construction.
-RUN rm -rf /app/src /app/scripts /app/run_grid_search.py \
+RUN rm -rf /app/src /app/scripts /app/experiments /app/run_grid_search.py \
     && mkdir -p /home/dev/workspace
 
 COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/
diff --git a/README.md b/README.md
index c5440ec3..7cc29b05 100644
--- a/README.md
+++ b/README.md
@@ -184,7 +184,7 @@ The default preset is `full_8gpu`, which splits GPUs across Boltz2 XRD, Boltz2
 MD, RF3, and Protenix. A positional target like `rf3` or `rf3,protenix` runs
 those jobs from `full_8gpu`; use `--preset rf3_partial` for a specific preset.
 
-Presets live in the synced repo at `src/sampleworks/runs/presets/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
+Presets live in the synced repo at `experiments/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
 
 ```bash
 run_experiments rf3,protenix
diff --git a/src/sampleworks/runs/presets/full_8gpu.toml b/experiments/full_8gpu.toml
similarity index 100%
rename from src/sampleworks/runs/presets/full_8gpu.toml
rename to experiments/full_8gpu.toml
diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/experiments/protenix_dual.toml
similarity index 100%
rename from src/sampleworks/runs/presets/protenix_dual.toml
rename to experiments/protenix_dual.toml
diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/experiments/rf3_partial.toml
similarity index 100%
rename from src/sampleworks/runs/presets/rf3_partial.toml
rename to experiments/rf3_partial.toml
diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/experiments/rf3_partial_chiral_off.toml
similarity index 100%
rename from src/sampleworks/runs/presets/rf3_partial_chiral_off.toml
rename to experiments/rf3_partial_chiral_off.toml
diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/experiments/rf3_protenix.toml
similarity index 100%
rename from src/sampleworks/runs/presets/rf3_protenix.toml
rename to experiments/rf3_protenix.toml
diff --git a/pyproject.toml b/pyproject.toml
index 71368fc8..2b958183 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@ sampleworks-guidance = "sampleworks.cli.guidance:main"
 sampleworks-runs = "sampleworks.runs.cli:main"
 
 [tool.hatch.build.targets.wheel.force-include]
-"src/sampleworks/runs/presets" = "sampleworks/runs/presets"
+"experiments" = "experiments"
 
 [tool.hatch.metadata]
 allow-direct-references = true
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index b669be7c..b0b94c85 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -34,7 +34,7 @@ def main(argv: list[str] | None = None) -> int:
     args = parser.parse_args(argv)
 
     if args.list:
-        for name in loader.list_bundled_presets():
+        for name in loader.list_presets():
             print(name)
         return 0
 
@@ -82,9 +82,9 @@ def _build_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--preset",
         default="",
-        help="Bundled preset name or path to a .toml file. Default: full_8gpu.",
+        help="Preset name from experiments/ or path to a .toml file. Default: full_8gpu.",
     )
-    parser.add_argument("--list", action="store_true", help="List bundled presets and exit")
+    parser.add_argument("--list", action="store_true", help="List experiments/*.toml presets and exit")
     parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit")
     parser.add_argument(
         "--dry-run",
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index ce8b130c..1adce888 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -12,39 +12,45 @@
 import re
 import tomllib
 from collections.abc import Iterable
-from importlib import resources
 from pathlib import Path
 from typing import Any
 
 from .schema import Job, Preset
 
 
-_BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets"
+_EXPERIMENTS_DIR_NAME = "experiments"
 _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
 _TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
 
 
-def list_bundled_presets() -> list[str]:
-    """List the names of all TOML presets shipped with the package.
+def list_presets() -> list[str]:
+    """List experiment preset names from the top-level ``experiments`` directory.
 
     Returns
     -------
     list of str
         Preset names (filename stems, no ``.toml`` extension), sorted
-        alphabetically.
+        alphabetically. If multiple experiment directories are visible, the
+        first directory in the resolution order wins for duplicate names.
     """
-    files = resources.files(_BUNDLED_PRESETS_PACKAGE)
-    return sorted(p.name.removesuffix(".toml") for p in files.iterdir() if p.name.endswith(".toml"))
+    names: dict[str, Path] = {}
+    for directory in _experiment_dirs():
+        if not directory.is_dir():
+            continue
+        for path in directory.iterdir():
+            if path.is_file() and path.suffix == ".toml":
+                names.setdefault(path.stem, path)
+    return sorted(names)
 
 
 def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
-    """Load a preset by bundled name or filesystem path.
+    """Load a preset by experiment name or filesystem path.
 
     Parameters
     ----------
     name_or_path : str
-        Either the name of a bundled preset (as returned by
-        :func:`list_bundled_presets`) or a path ending in ``.toml``.
+        Either the name of a preset in the top-level ``experiments`` directory
+        (as returned by :func:`list_presets`) or a path ending in ``.toml``.
     overrides : Iterable of str, optional
         ``KEY=VALUE`` strings as accepted by ``--set``. Applied before
         variable interpolation.
@@ -57,7 +63,7 @@ def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
     Raises
     ------
     FileNotFoundError
-        If ``name_or_path`` matches no bundled preset and no file on disk.
+        If ``name_or_path`` matches no experiment preset and no file on disk.
     KeyError
         If an override path begins with an unknown top-level key, or if a
         ``${VAR}`` reference cannot be resolved against the environment or
@@ -73,12 +79,12 @@ def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset:
 
 
 def _read_toml(name_or_path: str) -> dict[str, Any]:
-    """Read raw TOML from a filesystem path or a bundled package resource.
+    """Read raw TOML from a filesystem path or an experiment preset name.
 
     Parameters
     ----------
     name_or_path : str
-        Bundled preset name or filesystem path ending in ``.toml``.
+        Experiment preset name or filesystem path ending in ``.toml``.
 
     Returns
     -------
@@ -90,25 +96,103 @@ def _read_toml(name_or_path: str) -> dict[str, Any]:
     FileNotFoundError
         If neither location yields a TOML file.
     """
-    path = Path(name_or_path)
-    if path.suffix == ".toml" and path.exists():
+    path = _find_preset_path(name_or_path)
+    if path is not None:
         return tomllib.loads(path.read_text())
-    bundled = resources.files(_BUNDLED_PRESETS_PACKAGE) / f"{name_or_path}.toml"
-    if not bundled.is_file():
-        raise FileNotFoundError(
-            f"No preset {name_or_path!r}. Bundled: {list_bundled_presets()}. "
-            f"Or pass a path to a .toml file."
-        )
-    return tomllib.loads(bundled.read_text())
+    raise FileNotFoundError(
+        f"No preset {name_or_path!r}. Experiments: {list_presets()}. "
+        "Put TOML presets in ./experiments or pass a path to a .toml file."
+    )
+
+
+def _find_preset_path(name_or_path: str) -> Path | None:
+    """Resolve a preset name or path to an existing TOML file.
+
+    Parameters
+    ----------
+    name_or_path : str
+        Preset name (``full_8gpu``), TOML filename (``full_8gpu.toml``), or
+        filesystem path. An existing ``.toml`` path is returned as given.
+
+    Returns
+    -------
+    pathlib.Path or None
+        First existing match in precedence order, otherwise ``None``.
+    """
+    path = Path(name_or_path)
+    if path.suffix == ".toml" and path.is_file():
+        return path
+    # No such file: search by name, using only the file name of a ``.toml`` argument.
+    preset_filename = path.name if path.suffix == ".toml" else f"{name_or_path}.toml"
+    for directory in _experiment_dirs():
+        candidate = directory / preset_filename
+        if candidate.is_file():
+            return candidate
+    return None
+
+
+def _experiment_dirs() -> list[Path]:
+    """Return candidate top-level experiment directories in precedence order.
+
+    Returns
+    -------
+    list of pathlib.Path
+        Resolved, de-duplicated paths (not all need exist): env-var overrides, the
+        ``/home/dev/workspace`` checkout, then hits walking up from cwd and this module.
+    """
+    candidates: list[Path] = []
+
+    explicit = os.environ.get("SAMPLEWORKS_EXPERIMENTS_DIR")
+    if explicit:
+        candidates.append(Path(explicit))
+
+    source_dir = os.environ.get("SAMPLEWORKS_SOURCE_DIR")
+    if source_dir:
+        candidates.append(Path(source_dir) / _EXPERIMENTS_DIR_NAME)
+    # Fixed fallback path; added unconditionally, so it may not exist on this host.
+    candidates.append(Path("/home/dev/workspace") / _EXPERIMENTS_DIR_NAME)
+    candidates.extend(_find_upward_experiment_dirs(Path.cwd()))
+    candidates.extend(_find_upward_experiment_dirs(Path(__file__).resolve()))
+    # Compare resolved paths so different spellings of one directory collapse.
+    seen: set[Path] = set()
+    unique: list[Path] = []
+    for candidate in candidates:
+        resolved = candidate.expanduser().resolve(strict=False)
+        if resolved not in seen:
+            seen.add(resolved)
+            unique.append(resolved)
+    return unique
+
+
+def _find_upward_experiment_dirs(start: Path) -> list[Path]:
+    """Search ``start`` and its ancestors for existing ``experiments`` directories.
+
+    Parameters
+    ----------
+    start : pathlib.Path
+        Directory to begin at; for a non-directory path, its parent is used.
+
+    Returns
+    -------
+    list of pathlib.Path
+        Existing ``experiments`` directories, nearest to farthest.
+    """
+    current = start if start.is_dir() else start.parent
+    dirs: list[Path] = []
+    for parent in [current, *current.parents]:
+        candidate = parent / _EXPERIMENTS_DIR_NAME
+        if candidate.is_dir():
+            dirs.append(candidate)
+    return dirs
 
 
 def _preset_name(name_or_path: str) -> str:
-    """Return the canonical preset name for a bundled name or path argument.
+    """Return the canonical preset name for an experiment name or path argument.
 
     Parameters
     ----------
     name_or_path : str
-        Either a bundled name or a path ending in ``.toml``.
+        Either an experiment name or a path ending in ``.toml``.
 
     Returns
     -------
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index f21494f6..451c76b5 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -70,7 +70,7 @@ class Preset:
     Parameters
     ----------
     name : str
-        Identifier (matches the bundled TOML filename without the ``.toml``
+        Identifier (matches the experiment TOML filename without the ``.toml``
         suffix, or the stem of a user-supplied path).
     description : str
         Human-readable summary shown by ``--list`` and the launch banner.
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index adbe5432..c8e70f54 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -8,7 +8,7 @@
 from sampleworks.runs import cli
 
 
-def test_list_prints_all_bundled_presets(capsys: pytest.CaptureFixture[str]) -> None:
+def test_list_prints_all_experiment_presets(capsys: pytest.CaptureFixture[str]) -> None:
     exit_code = cli.main(["--list"])
     assert exit_code == 0
     out = capsys.readouterr().out.splitlines()
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 1e050572..c078615e 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -11,13 +11,13 @@
 BUNDLED = ["full_8gpu", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
 
 
-def test_list_bundled_presets_returns_the_five() -> None:
-    names = loader.list_bundled_presets()
-    assert set(names) == set(BUNDLED), f"unexpected bundled presets: {names}"
+def test_list_presets_returns_the_five() -> None:
+    names = loader.list_presets()
+    assert set(names) == set(BUNDLED), f"unexpected experiment presets: {names}"
 
 
 @pytest.mark.parametrize("name", BUNDLED)
-def test_each_bundled_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch) -> None:
+def test_each_experiment_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset(name)
     assert preset.name == name
@@ -116,6 +116,31 @@ def test_load_preset_from_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch)
     assert preset.defaults["DATA_DIR"] == "/x"
 
 
+def test_load_preset_from_experiments_dir_override(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Scientists can point the loader at a top-level experiments directory."""
+    experiments_dir = tmp_path / "experiments"
+    experiments_dir.mkdir()
+    (experiments_dir / "custom.toml").write_text(
+        'description = "custom"\n'
+        "[shared_args]\n"
+        'model = "rf3"\n'
+        "[[jobs]]\n"
+        'name = "j1"\n'
+        'env = "rf3"\n'
+        'gpus = "0"\n'
+        'output_subdir = "j1"\n'
+        "args = {}\n"
+    )
+    monkeypatch.setenv("SAMPLEWORKS_EXPERIMENTS_DIR", str(experiments_dir))
+
+    preset = loader.load_preset("custom")
+
+    assert preset.name == "custom"
+    assert preset.job("j1").env == "rf3"
+
+
 def test_unknown_preset_raises() -> None:
     with pytest.raises(FileNotFoundError):
         loader.load_preset("does_not_exist")

From b31f4424082cc1f2654c8c87a514545b6b5f5fab Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 13:30:35 -0400
Subject: [PATCH 19/28] fix(runs): clarify CUDA worker GPU logging

---
 run_grid_search.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/run_grid_search.py b/run_grid_search.py
index cfef17cf..29f2d792 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -211,7 +211,7 @@ def run_grid_search(
         for worker_num, job_queue_path in enumerate(job_queue_paths):
             model = worker_job_queues[worker_num][0].model
             future = executor.submit(
-                run_guidance_queue_script, (job_queue_path, max_workers, model, worker_num)
+                run_guidance_queue_script, (job_queue_path, model, worker_num, gpus)
             )
             futures[future] = job_queue_path
 
@@ -240,8 +240,16 @@ def run_grid_search(
     return results
 
 
-def run_guidance_queue_script(args: tuple[str, int, str, int]):
-    job_queue_path, max_workers, model, worker_num = args
+def run_guidance_queue_script(args: tuple[str, str, int, list[str]]):
+    """Run one pickled guidance job queue in the model's pixi environment.
+
+    Parameters
+    ----------
+    args : tuple of str, str, int, and list of str
+        Job queue path, model name, worker index, and selected GPU entries. CUDA remaps
+        selected entries such as ``4,5`` to local process indices ``0,1``.
+    """
+    job_queue_path, model, worker_num, gpus = args
     pixi_env = get_pixi_env(model)
     script_path = Path(__file__).parent / "scripts" / "run_guidance_pipeline.py"
     env_python = get_pixi_env_python(pixi_env)
@@ -260,7 +268,16 @@ def run_guidance_queue_script(args: tuple[str, int, str, int]):
             job_queue_path,
         ]
         env = os.environ.copy()
-    log.info(f"Running worker {worker_num}: {cmd} on GPU {worker_num % max_workers}")
+    local_gpu = worker_num % len(gpus)
+    requested_gpu = gpus[local_gpu]
+    if os.environ.get("CUDA_VISIBLE_DEVICES"):
+        gpu_source = "CUDA_VISIBLE_DEVICES"
+    else:
+        gpu_source = "GPU detection"
+    log.info(
+        f"Running worker {worker_num}: {cmd} on local CUDA GPU {local_gpu} "
+        f"(selected GPU {requested_gpu} via {gpu_source})"
+    )
 
     with open(job_queue_path.replace(".pkl", ".log"), "w") as log_file:
         result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)

From 47984834cb50d82d207c950716b9c0104e624786 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 13:45:22 -0400
Subject: [PATCH 20/28] docs(runs): improve docstring coverage

---
 run_grid_search.py                            | 15 +++++++++++++++
 src/sampleworks/runs/loader.py                |  1 +
 .../utils/guidance_script_arguments.py        | 19 ++++++++++++++++---
 .../utils/guidance_script_utils.py            | 10 ++++++++++
 tests/runs/test_cli.py                        |  8 ++++++++
 tests/runs/test_loader.py                     | 14 ++++++++++++++
 tests/runs/test_runner.py                     |  5 +++++
 7 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/run_grid_search.py b/run_grid_search.py
index 29f2d792..64fb2cf2 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -25,6 +25,8 @@
 
 @dataclass
 class GridSearchConfig:
+    """Serializable summary of the grid-search dimensions and output location."""
+
     model: str
     scalers: list[str]
     ensemble_sizes: list[int]
@@ -67,6 +69,12 @@ def get_job_status(job: JobConfig) -> str:
 
 
 def detect_gpus() -> list[str]:
+    """Return CUDA GPU identifiers visible to this grid-search process.
+
+    ``CUDA_VISIBLE_DEVICES`` wins when set because CUDA remaps those entries to
+    local process ordinals. Otherwise, ``nvidia-smi`` is used as a best-effort
+    discovery mechanism and ``["0"]`` is returned as a CPU/test fallback.
+    """
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
     if cuda_visible:
         gpus = [g.strip() for g in cuda_visible.split(",") if g.strip()]
@@ -105,6 +113,7 @@ def detect_gpus() -> list[str]:
 
 
 def get_pixi_env(model: str) -> str:
+    """Return the pixi environment name needed to run a model family."""
     if model in (StructurePredictor.BOLTZ_1, StructurePredictor.BOLTZ_2):
         return "boltz"
     elif model == StructurePredictor.PROTENIX:
@@ -119,6 +128,7 @@ def get_pixi_env(model: str) -> str:
 def build_args_for_process_pool(
     job: JobConfig, args: argparse.Namespace, device_num: int | None = None
 ) -> GuidanceConfig:
+    """Convert a grid-search job into the picklable guidance config for a worker."""
     guidance_config = GuidanceConfig(
         protein=job.protein,
         structure=job.structure_path,
@@ -384,6 +394,7 @@ def main(args: argparse.Namespace):
 
 
 def generate_jobs(args: argparse.Namespace) -> list[JobConfig]:
+    """Expand CLI grid dimensions into concrete per-protein guidance jobs."""
     jobs = []
 
     proteins = ProteinInput.from_csv(Path(args.proteins))
@@ -469,6 +480,7 @@ def save_results(
     output_dir: str,
     total_time: float,
 ):
+    """Merge the latest job results into ``results.json`` under ``output_dir``."""
     os.makedirs(output_dir, exist_ok=True)
     results_path = os.path.join(output_dir, "results.json")
 
@@ -527,6 +539,7 @@ def save_results(
 
 
 def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for one model-specific grid search."""
     parser = argparse.ArgumentParser(
         description="Run grid search across scalers, and parameters for a single "
         "protein structure predictor model."
@@ -663,6 +676,7 @@ def parse_args() -> argparse.Namespace:
 
 
 def log_args(args: argparse.Namespace, gpus: list[str]):
+    """Log the resolved grid-search configuration before jobs are generated."""
     log.info("=" * 50)
     log.info("Starting grid search")
     log.info(f"Model: {args.model}")
@@ -684,6 +698,7 @@ def log_args(args: argparse.Namespace, gpus: list[str]):
 # TODO make job statuses a proper class
 # TODO: there are many constants here like "not_run" that should be defined in only one place.
 def generate_and_filter_jobs(args: argparse.Namespace) -> tuple[list[JobConfig], dict[Any, Any]]:
+    """Generate jobs and filter them according to prior status and rerun flags."""
     jobs = generate_jobs(args)
     log.info(f"Generated {len(jobs)} total jobs")
 
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index 1adce888..1d541ac0 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -445,6 +445,7 @@ def _expand(text: str, env: dict[str, str]) -> str:
     """
 
     def repl(match: re.Match[str]) -> str:
+        """Return the configured value for one ``${VAR}`` interpolation match."""
         var = match.group(1)
         if var not in env:
             raise KeyError(f"Undefined variable ${{{var}}} in preset (no env var, no default)")
diff --git a/src/sampleworks/utils/guidance_script_arguments.py b/src/sampleworks/utils/guidance_script_arguments.py
index 2876c06e..29827add 100644
--- a/src/sampleworks/utils/guidance_script_arguments.py
+++ b/src/sampleworks/utils/guidance_script_arguments.py
@@ -45,9 +45,10 @@
 def _resolve_checkpoint(model_key: str) -> str:
     """Return the first checkpoint path that exists on disk for *model_key*.
 
-    Tries baked-in Docker paths first (``/checkpoints/``), then falls back to
-    legacy development paths.  If none are found the first candidate is returned
-    so that downstream validation produces a clear error message.
+    Model-specific environment variables from :data:`_CHECKPOINT_ENV_VARS` win
+    when set. Otherwise, candidates from :data:`_CHECKPOINT_CANDIDATES` are
+    tried in order, starting with baked-in ``/checkpoints/`` paths and then
+    ACTL shared-storage and legacy development locations.
     """
     env_var = _CHECKPOINT_ENV_VARS.get(model_key)
     candidates = []
@@ -364,6 +365,7 @@ def __post_init__(self):
             raise ValueError(f"Unknown model type: {self.model}")
 
     def populate_config_for_guidance_type(self, job: JobConfig, args: argparse.Namespace):
+        """Apply per-job grid-search values onto this guidance configuration."""
         checkpoint = get_checkpoint(args)
         if checkpoint is not None:
             self.model_checkpoint = checkpoint
@@ -407,6 +409,7 @@ def as_dict(self) -> dict[str, Any]:
 
 
 def add_generic_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments shared by all models and guidance methods."""
     parser.add_argument("--structure", type=str, required=True, help="Input structure")
     parser.add_argument("--density", type=str, required=True, help="Input density map")
     parser.add_argument("--output-dir", type=str, default="output", help="Output directory")
@@ -473,6 +476,7 @@ def add_generic_args(parser: argparse.ArgumentParser | GuidanceConfig):
 # Guidance type specific arguments
 ######################
 def add_pure_guidance_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to pure guidance sampling."""
     parser.add_argument("--step-size", type=float, default=0.1, help="Gradient step")
     parser.add_argument(
         "--step-scaler-type",
@@ -485,6 +489,7 @@ def add_pure_guidance_args(parser: argparse.ArgumentParser | GuidanceConfig):
 
 
 def add_fk_steering_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to Feynman-Kac steering."""
     parser.add_argument(
         "--num-particles",
         type=int,
@@ -527,6 +532,7 @@ def add_fk_steering_args(parser: argparse.ArgumentParser | GuidanceConfig):
 # Model specific arguments
 ###########
 def add_boltz2_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to Boltz2 guidance runs."""
     parser.add_argument(
         "--model-checkpoint",
         type=str,
@@ -542,6 +548,7 @@ def add_boltz2_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
 
 
 def add_protenix_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to Protenix guidance runs."""
     parser.add_argument(
         "--model-checkpoint",
         type=str,
@@ -551,6 +558,7 @@ def add_protenix_specific_args(parser: argparse.ArgumentParser | GuidanceConfig)
 
 
 def add_boltz1_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to Boltz1 guidance runs."""
     parser.add_argument(
         "--model-checkpoint",
         type=str,
@@ -560,6 +568,7 @@ def add_boltz1_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
 
 
 def add_rf3_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
+    """Add CLI arguments specific to RF3 guidance runs."""
     parser.add_argument(
         "--model-checkpoint",
         type=str,
@@ -599,6 +608,8 @@ def add_rf3_specific_args(parser: argparse.ArgumentParser | GuidanceConfig):
 
 @dataclass
 class JobConfig:
+    """Resolved inputs and grid-search settings for one guidance job."""
+
     protein: str
     structure_path: Path | str
     density_path: Path | str
@@ -615,6 +626,8 @@ class JobConfig:
 
 @dataclass
 class JobResult:
+    """Serializable status record produced after a guidance job finishes."""
+
     protein: str
     model: str
     method: str | None
diff --git a/src/sampleworks/utils/guidance_script_utils.py b/src/sampleworks/utils/guidance_script_utils.py
index 5d26a7c6..3832dfb2 100644
--- a/src/sampleworks/utils/guidance_script_utils.py
+++ b/src/sampleworks/utils/guidance_script_utils.py
@@ -71,6 +71,7 @@ def save_trajectory(
     subdir_name,
     save_every=10,
 ):
+    """Dispatch trajectory serialization to the handler for the selected scaler."""
     if scaler_type == GuidanceType.PURE_GUIDANCE:
         _save_trajectory(trajectory, atom_array, output_dir, subdir_name, save_every)
     elif scaler_type == GuidanceType.FK_STEERING:
@@ -100,6 +101,7 @@ def _write_coords_into_array(
 
 
 def _save_trajectory(trajectory, atom_array, output_dir, subdir_name, save_every):
+    """Save a pure-guidance coordinate trajectory as sampled multi-model CIFs."""
     output_dir = Path(output_dir / "trajectory" / subdir_name)
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -122,6 +124,7 @@ def _save_trajectory(trajectory, atom_array, output_dir, subdir_name, save_every
 
 
 def _save_fk_steering_trajectory(trajectory, atom_array, output_dir, subdir_name, save_every):
+    """Save the first-particle FK-steering trajectory as sampled multi-model CIFs."""
     output_dir = Path(output_dir / "trajectory" / subdir_name)
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -146,6 +149,7 @@ def _save_fk_steering_trajectory(trajectory, atom_array, output_dir, subdir_name
 
 
 def save_losses(losses, output_dir):
+    """Write per-step guidance losses to ``losses.txt`` in ``output_dir``."""
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -165,6 +169,7 @@ def get_model_and_device(
     method: str | None = None,
     model: Any = None,
 ) -> tuple[torch.device, Any]:
+    """Validate a checkpoint, choose a device, and construct the model wrapper."""
     validated_checkpoint_path = validate_model_checkpoint(model_type, model_checkpoint_path)
 
     device = torch.device(device_str) if device_str else try_gpu()
@@ -225,6 +230,7 @@ def get_reward_function_and_structure(
     resolution,
     structure_path: str | Path,
 ) -> tuple[RealSpaceRewardFunction, dict[str, Any]]:
+    """Load structure and density inputs and build the real-space reward function."""
     logger.debug(f"Loading structure from {structure_path}")
     safe_structure_path = resolve_mixed_hetatm_atom_altlocs(Path(structure_path))
     structure = parse(
@@ -418,6 +424,7 @@ def run_guidance(args: GuidanceConfig, guidance_type: str, model_wrapper, device
 
 # "guidance_type" is also called "scaler" in many places
 def _run_guidance(args: GuidanceConfig, guidance_type: str, model_wrapper, device):
+    """Run one configured guidance trajectory and save its outputs."""
     reward_function, structure = get_reward_function_and_structure(
         args.density,  # str/path to a map file.
         device,  # this needs to come from the global context, not the args object.
@@ -594,6 +601,7 @@ def _run_guidance(args: GuidanceConfig, guidance_type: str, model_wrapper, devic
 
 
 def epoch_seconds(time_to_convert: datetime) -> float:
+    """Convert a naive :class:`datetime.datetime` (treated as UTC) to Unix epoch seconds."""
     return (time_to_convert - datetime(1970, 1, 1)).total_seconds()
 
 
@@ -605,6 +613,7 @@ def get_job_result(
     exit_code: int,
     status: str,
 ) -> JobResult:
+    """Build the serializable result record for a completed guidance job."""
     start_time = epoch_seconds(started_at)
     end_time = epoch_seconds(ended_at)
     result = JobResult(
@@ -627,6 +636,7 @@ def get_job_result(
 
 
 def run_guidance_job_queue(job_queue_path: str) -> list[JobResult]:
+    """Load a pickled job queue, reuse one model wrapper, and run all jobs."""
     with open(job_queue_path, "rb") as fp:
         job_queue: list[GuidanceConfig] = pickle.load(fp)
 
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index c8e70f54..84b53af6 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -9,6 +9,7 @@
 
 
 def test_list_prints_all_experiment_presets(capsys: pytest.CaptureFixture[str]) -> None:
+    """``--list`` prints every bundled experiment preset exactly once."""
     exit_code = cli.main(["--list"])
     assert exit_code == 0
     out = capsys.readouterr().out.splitlines()
@@ -24,6 +25,7 @@ def test_list_prints_all_experiment_presets(capsys: pytest.CaptureFixture[str])
 def test_show_prints_resolved_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """``--show`` renders the resolved preset without launching jobs."""
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(["--preset", "rf3_partial", "--show"])
     assert exit_code == 0
@@ -35,6 +37,7 @@ def test_show_prints_resolved_preset(
 def test_dry_run_does_not_invoke_subprocess(
     monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """``--dry-run`` prints commands and CUDA assignment instead of executing."""
     monkeypatch.setenv("HOME", str(tmp_path))
     exit_code = cli.main([
         "--preset",
@@ -52,6 +55,7 @@ def test_dry_run_does_not_invoke_subprocess(
 def test_job_shortcut_filters_default_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """A positional job shortcut filters the default full_8gpu preset."""
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(["rf3,protenix", "--show"])
     assert exit_code == 0
@@ -66,6 +70,7 @@ def test_job_shortcut_filters_default_preset(
 def test_jobs_filters_explicit_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """``--jobs`` filters an explicitly selected preset by job name."""
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(["--preset", "full_8gpu", "--jobs", "rf3", "--show"])
     assert exit_code == 0
@@ -76,6 +81,7 @@ def test_jobs_filters_explicit_preset(
 
 
 def test_job_shortcut_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Unknown positional job shortcuts fail with a clear selector error."""
     monkeypatch.setenv("HOME", "/home/test")
     with pytest.raises(SystemExit, match="unknown jobs"):
         cli.main(["nonexistent", "--show"])
@@ -84,6 +90,7 @@ def test_job_shortcut_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -
 def test_set_override_propagates_through_cli(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """``--set`` overrides are applied before the preset is displayed."""
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(
         [
@@ -102,6 +109,7 @@ def test_set_override_propagates_through_cli(
 def test_no_target_defaults_to_full_8gpu(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
+    """Running without a target resolves to the flagship full_8gpu preset."""
     monkeypatch.setenv("HOME", "/home/test")
     exit_code = cli.main(["--show"])
     assert exit_code == 0
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index c078615e..0e8d47cb 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -12,12 +12,14 @@
 
 
 def test_list_presets_returns_the_five() -> None:
+    """Preset discovery returns the expected bundled experiment names."""
     names = loader.list_presets()
     assert set(names) == set(BUNDLED), f"unexpected experiment presets: {names}"
 
 
 @pytest.mark.parametrize("name", BUNDLED)
 def test_each_experiment_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch) -> None:
+    """Every bundled preset loads into jobs with supported pixi environments."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset(name)
     assert preset.name == name
@@ -27,6 +29,7 @@ def test_each_experiment_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch
 
 
 def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Environment variables override preset defaults during interpolation."""
     monkeypatch.setenv("HOME", "/home/test")
     monkeypatch.setenv("DATA_DIR", "/from/env")
     preset = loader.load_preset("rf3_partial")
@@ -37,6 +40,7 @@ def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> No
 
 
 def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Preset defaults fill in interpolation variables absent from the environment."""
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial")
@@ -53,6 +57,7 @@ def test_full_8gpu_uses_canonical_inputs_dir(monkeypatch: pytest.MonkeyPatch) ->
 
 
 def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
+    """``--set defaults.*`` overrides participate in later interpolation."""
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial", overrides=["defaults.DATA_DIR=/custom"])
@@ -61,18 +66,21 @@ def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
 
 
 def test_set_override_at_job_by_name(monkeypatch: pytest.MonkeyPatch) -> None:
+    """``--set jobs.<name>.*`` updates the named job."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("full_8gpu", overrides=["jobs.rf3.gpus=7"])
     assert preset.job("rf3").gpus == "7"
 
 
 def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None:
+    """``--set jobs.<index>.*`` updates the indexed job."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("full_8gpu", overrides=["jobs.0.gpus=9"])
     assert preset.jobs[0].gpus == "9"
 
 
 def test_set_override_at_args_inside_job(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Dotted overrides can create or replace per-job CLI args."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset(
         "rf3_partial", overrides=["jobs.rf3.args.gradient-weights=0.0 0.01"]
@@ -81,6 +89,7 @@ def test_set_override_at_args_inside_job(monkeypatch: pytest.MonkeyPatch) -> Non
 
 
 def test_set_coerces_bool_and_int(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Override values are coerced to bools and ints when unambiguous."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset(
         "rf3_partial",
@@ -96,6 +105,7 @@ def test_set_coerces_bool_and_int(monkeypatch: pytest.MonkeyPatch) -> None:
 
 
 def test_load_preset_from_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    """A filesystem TOML path loads as a custom preset."""
     monkeypatch.setenv("HOME", "/home/test")
     custom = tmp_path / "mycustom.toml"
     custom.write_text(
@@ -142,11 +152,13 @@ def test_load_preset_from_experiments_dir_override(
 
 
 def test_unknown_preset_raises() -> None:
+    """Missing preset names raise ``FileNotFoundError``."""
     with pytest.raises(FileNotFoundError):
         loader.load_preset("does_not_exist")
 
 
 def test_undefined_variable_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """Unresolved ``${VAR}`` references fail instead of expanding to empty strings."""
     bad = tmp_path / "bad.toml"
     bad.write_text(
         '[shared_args]\nproteins = "${NEVER_DEFINED_VAR}/x"\n'
@@ -158,6 +170,7 @@ def test_undefined_variable_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Pa
 
 
 def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Malformed override specs must contain a ``KEY=VALUE`` separator."""
     monkeypatch.setenv("HOME", "/home/test")
     with pytest.raises(ValueError, match="KEY=VALUE"):
         loader.load_preset("rf3_partial", overrides=["bogus_no_equals"])
@@ -171,6 +184,7 @@ def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch)
 
 
 def test_bad_env_rejected(tmp_path: Path) -> None:
+    """Preset jobs reject unsupported pixi environment names."""
     bad = tmp_path / "bad.toml"
     bad.write_text(
         '[[jobs]]\nname = "j"\nenv = "not_a_real_env"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n'
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 9305368d..5eec8946 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -40,6 +40,7 @@ def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> N
 
 
 def test_argv_omits_false_bool_flags(monkeypatch: pytest.MonkeyPatch) -> None:
+    """False boolean args are omitted rather than emitted as bare CLI flags."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset(
         "rf3_partial", overrides=["shared_args.gradient-normalization=false"]
@@ -51,6 +52,7 @@ def test_argv_omits_false_bool_flags(monkeypatch: pytest.MonkeyPatch) -> None:
 def test_explicit_output_dir_in_args_wins_over_subdir_default(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:
+    """An explicit per-job output-dir beats the output_subdir-derived default."""
     monkeypatch.setenv("HOME", "/home/test")
     custom = tmp_path / "custom.toml"
     custom.write_text(
@@ -67,6 +69,7 @@ def test_explicit_output_dir_in_args_wins_over_subdir_default(
 def test_full_8gpu_has_four_jobs_with_distinct_gpus(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
+    """The full_8gpu preset maps its four jobs onto distinct GPU pairs."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("full_8gpu")
     invocations = runner.build_invocations(preset, results_dir=Path("/r"))
@@ -76,6 +79,7 @@ def test_full_8gpu_has_four_jobs_with_distinct_gpus(
 
 
 def test_protenix_dual_uses_different_checkpoints(monkeypatch: pytest.MonkeyPatch) -> None:
+    """The Protenix dual preset uses separate tiny and mini checkpoints."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("protenix_dual")
     invocations = runner.build_invocations(preset, results_dir=Path("/r"))
@@ -85,6 +89,7 @@ def test_protenix_dual_uses_different_checkpoints(monkeypatch: pytest.MonkeyPatc
 
 
 def test_rf3_partial_chiral_off_flag_present(monkeypatch: pytest.MonkeyPatch) -> None:
+    """The RF3 chiral-off preset passes the disable and force rerun flags."""
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("rf3_partial_chiral_off")
     inv = runner.build_invocations(preset, results_dir=Path("/r"))[0]

From 57f3beceb44da3d751cd16d8afb61dd2d10f50de Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 13:52:56 -0400
Subject: [PATCH 21/28] docs(runs): add ACTL run_experiments how-to

---
 README.md | 68 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 7cc29b05..877c06de 100644
--- a/README.md
+++ b/README.md
@@ -152,49 +152,65 @@ Output layout: `grid_search_results/<protein>/<model>[_<method>]/<scaler>/ens<N>
 Instructions for running evaluation and metrics scripts are coming soon.
 
 
-## ACTL preset experiments (`run_experiments`)
+## Running preset experiments on ACTL (`run_experiments`)
 
-Use ACTL to get one 8-GPU pod with baked pixi environments, checkpoints, the
-shared data PVC, and your local checkout synced to `/home/dev/workspace`:
+Start an 8-GPU ACTL machine named `sampleworks` with the Sampleworks image and
+the shared data volume mounted:
 
 ```bash
-actl pod up sampleworks-pr240 --fresh --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+actl pod up sampleworks --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
 ```
 
-Keep that `actl pod up` terminal open so sync/SSH stays alive. In another
-terminal, copy the `ssh:` line from `actl pod status sampleworks-pr240`, then:
+Keep that terminal open; it maintains sync and SSH. From another terminal:
 
 ```bash
-ssh workspace.actl-ws-<user>-sampleworks-pr240.devspace
+actl pod status sampleworks
+# copy the `ssh:` line, then run it, for example:
+ssh workspace.actl-ws-<user>-sampleworks.devspace
 cd /home/dev/workspace
-run_experiments --dry-run
-run_experiments
-run_experiments rf3
 ```
 
-`run_experiments` is the entrypoint. It uses the synced source tree from
-`/home/dev/workspace`, but runs the baked interpreters from
-`/app/.pixi/envs/{boltz,protenix,rf3}/bin/python` directly. It should not run
-`pixi install` or pull packages at runtime; if an env is missing, recreate the
-pod with the current `pixi-with-checkpoints` image.
+The main command is `run_experiments`. It reads TOML presets and launches the
+matching `run_grid_search.py` jobs, configuring each job's pixi environment,
+GPU assignment, logs, results directory, and MSA cache.
 
-`run_experiments` reads TOML presets and launches the requested
-`run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job.
-The default preset is `full_8gpu`, which splits GPUs across Boltz2 XRD, Boltz2
-MD, RF3, and Protenix. A positional target like `rf3` or `rf3,protenix` runs
-those jobs from `full_8gpu`; use `--preset rf3_partial` for a specific preset.
+```bash
+run_experiments --list        # show available presets
+run_experiments --show rf3    # inspect what will run
+run_experiments --dry-run rf3 # print commands without running
+run_experiments rf3           # run only the RF3 job from full_8gpu
+run_experiments               # run the default full_8gpu preset
+```
 
-Presets live in the synced repo at `experiments/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch:
+The default `full_8gpu` preset runs Boltz2 XRD, Boltz2 MD, RF3, and Protenix in
+parallel. Run a subset with:
 
 ```bash
-run_experiments rf3,protenix
-run_experiments --preset rf3_partial --set jobs.rf3.gpus=0
+run_experiments --preset full_8gpu --jobs rf3,protenix
 ```
 
-On smaller pods, make sure preset GPU IDs only reference visible pod GPUs
-(`0..N-1`). `run_experiments` fails fast if a preset requests unavailable GPUs.
+Presets live in `experiments/*.toml` in your local checkout and on the pod at
+`/home/dev/workspace/experiments/*.toml`. To modify an experiment, edit or copy
+a preset locally, let ACTL sync it, then run it by name or path:
+
+```bash
+cp experiments/rf3_partial.toml experiments/my_rf3.toml
+# edit experiments/my_rf3.toml locally
+run_experiments --preset my_rf3
+```
+
+For one-off changes, use `--set` instead of editing TOML:
+
+```bash
+run_experiments rf3 --set jobs.rf3.gpus=0,1
+run_experiments rf3 --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
+```
 
-The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations.
+Defaults: inputs come from `/mnt/diffuse-shared/raw/sampleworks/...`, checkpoints
+from `/mnt/diffuse-shared/raw/checkpoints`, results go to
+`/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`, and MSA caches go to
+`/mnt/diffuse-shared/cache/sampleworks/msa`. Override with `DATA_DIR`,
+`RESULTS_DIR`, or `MSA_CACHE_DIR` before running.
 
 
 ## Docker

From bb65db4a753a716f608b543b05f96c554ab38bc2 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 14:04:23 -0400
Subject: [PATCH 22/28] fix(runs): address lint and runner review issues

---
 Dockerfile                     |  2 +-
 run_experiments                | 27 +++++++++------
 run_grid_search.py             | 20 ++++++++----
 src/sampleworks/runs/cli.py    |  6 +++-
 src/sampleworks/runs/loader.py | 32 +++++++++++++-----
 src/sampleworks/runs/runner.py | 60 ++++++++++++++++++++++++----------
 src/sampleworks/runs/schema.py |  5 +--
 tests/runs/conftest.py         |  7 ++++
 tests/runs/test_cli.py         | 16 +++++----
 tests/runs/test_loader.py      | 21 ++++++++++++
 10 files changed, 142 insertions(+), 54 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b45067b9..439421e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -105,7 +105,7 @@ COPY docker-entrypoint.sh /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh
 
 # ============================================================================
-# Bake in model checkpoints from pre-built base image on Docker Hub
+# Bake in model checkpoints from pre-built Harbor image
 # ============================================================================
 # Checkpoints (~10 GB) rarely change, so this layer is placed before pixi
 # installs to stay cached even when dependencies update.
diff --git a/run_experiments b/run_experiments
index 88adf215..84e831e5 100755
--- a/run_experiments
+++ b/run_experiments
@@ -39,6 +39,13 @@ find_sampleworks_root_upwards() {
 resolve_repo_root() {
     local source_override="${SAMPLEWORKS_SOURCE_DIR:-}"
     if [[ -n "$source_override" ]]; then
+        if ! is_sampleworks_root "$source_override"; then
+            cat >&2 <<EOF
+SAMPLEWORKS_SOURCE_DIR does not point to a Sampleworks checkout:
+  $source_override
+EOF
+            return 2
+        fi
         printf '%s\n' "$source_override"
         return 0
     fi
@@ -191,8 +198,17 @@ for checkpoint_var_and_file in \
     fi
 done
 
+needs_runtime_paths=1
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run|--show|--list|-h|--help)
+            needs_runtime_paths=0
+            ;;
+    esac
+done
+
 source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
-if [[ -f "$source_proteins_csv" ]]; then
+if [[ "$needs_runtime_paths" -eq 1 && -f "$source_proteins_csv" ]]; then
     # The shared proteins.csv currently contains absolute /data/inputs paths,
     # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
     # manifest instead of requiring non-root scientists to create /data symlinks.
@@ -227,15 +243,6 @@ if [[ -n "$explicit_jobs" ]]; then
     display_target="$display_target --jobs $explicit_jobs"
 fi
 
-needs_runtime_paths=1
-for arg in "$@"; do
-    case "$arg" in
-        --dry-run|--show|--list|-h|--help)
-            needs_runtime_paths=0
-            ;;
-    esac
-done
-
 if [[ "$needs_runtime_paths" -eq 1 ]]; then
     if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then
         cat >&2 <<EOF
diff --git a/run_grid_search.py b/run_grid_search.py
index 64fb2cf2..bc4c3173 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -72,11 +72,15 @@ def detect_gpus() -> list[str]:
     """Return CUDA GPU identifiers visible to this grid-search process.
 
     ``CUDA_VISIBLE_DEVICES`` wins when set because CUDA remaps those entries to
-    local process ordinals. Otherwise, ``nvidia-smi`` is used as a best-effort
-    discovery mechanism and ``["0"]`` is returned as a CPU/test fallback.
+    local process ordinals. Explicit CUDA "no device" sentinel values return an
+    empty list. Otherwise, ``nvidia-smi`` is used as a best-effort discovery
+    mechanism and ``["0"]`` is returned as a CPU/test fallback.
     """
-    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
-    if cuda_visible:
+    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
+    cuda_visible_key = cuda_visible.lower()
+    if cuda_visible_key in {"none", "void", "nodevfiles"}:
+        return []
+    if cuda_visible and cuda_visible_key != "all":
         gpus = [g.strip() for g in cuda_visible.split(",") if g.strip()]
         try:
             result = subprocess.run(
@@ -85,9 +89,7 @@ def detect_gpus() -> list[str]:
                 text=True,
             )
             if result.returncode == 0:
-                visible = [
-                    g.strip() for g in result.stdout.strip().split("\n") if g.strip()
-                ]
+                visible = [g.strip() for g in result.stdout.strip().split("\n") if g.strip()]
                 if all(g.isdigit() for g in gpus + visible):
                     missing = sorted(set(gpus).difference(visible), key=int)
                     if missing:
@@ -361,6 +363,10 @@ def main(args: argparse.Namespace):
     log.info(f"Detected {len(gpus)} GPUs: {gpus}")
     if args.max_parallel != "auto":
         gpus = gpus[: int(args.max_parallel)]
+    if not gpus:
+        raise ValueError(
+            "No CUDA GPUs are visible; unset CUDA_VISIBLE_DEVICES=none or use a GPU pod"
+        )
 
     log_args(args, gpus)
 
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index b0b94c85..9a5257e0 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -84,7 +84,11 @@ def _build_parser() -> argparse.ArgumentParser:
         default="",
         help="Preset name from experiments/ or path to a .toml file. Default: full_8gpu.",
     )
-    parser.add_argument("--list", action="store_true", help="List experiments/*.toml presets and exit")
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List experiments/*.toml presets and exit",
+    )
     parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit")
     parser.add_argument(
         "--dry-run",
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index 1d541ac0..d5f88e4d 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -1,9 +1,11 @@
 """Load presets from TOML and apply runtime overrides.
 
 Resolution order for every string value (defaults block and ``args``):
-  1. ``${VAR}`` references are resolved against the process environment,
+  1. ``--set <dotted-path>=<value>`` CLI overrides are applied to the raw TOML
+     dict by :func:`load_preset`, so overridden values participate in
+     interpolation.
+  2. ``${VAR}`` references are resolved against the process environment,
      with the preset's ``[defaults]`` block filling in any unset keys.
-  2. ``--set <dotted-path>=<value>`` CLI overrides are applied last.
 """
 
 from __future__ import annotations
@@ -19,6 +21,7 @@
 
 
 _EXPERIMENTS_DIR_NAME = "experiments"
+_MAX_EXPAND_ITERATIONS = 32
 _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
 _TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
 
@@ -325,10 +328,15 @@ def _find_in_list(items: list[Any], key: str, *, where: str) -> int:
     Raises
     ------
     KeyError
-        If no element with the given name exists.
+        If no element with the given name or index exists.
     """
     if key.isdigit() or (key.startswith("-") and key[1:].isdigit()):
-        return int(key)
+        index = int(key)
+        try:
+            items[index]
+        except IndexError:
+            raise KeyError(f"No list element at index {index} at {where!r}") from None
+        return index
     for i, item in enumerate(items):
         if isinstance(item, dict) and item.get("name") == key:
             return i
@@ -442,6 +450,8 @@ def _expand(text: str, env: dict[str, str]) -> str:
     ------
     KeyError
         If a referenced variable is not in ``env``.
+    ValueError
+        If recursive variable interpolation does not converge.
     """
 
     def repl(match: re.Match[str]) -> str:
@@ -451,12 +461,16 @@ def repl(match: re.Match[str]) -> str:
             raise KeyError(f"Undefined variable ${{{var}}} in preset (no env var, no default)")
         return env[var]
 
-    prev = None
     current = text
-    while prev != current:
-        prev = current
-        current = _VAR_PATTERN.sub(repl, current)
-    return current
+    for _ in range(_MAX_EXPAND_ITERATIONS):
+        expanded = _VAR_PATTERN.sub(repl, current)
+        if expanded == current:
+            return expanded
+        current = expanded
+    raise ValueError(
+        f"Variable expansion did not converge for {text!r}; check for circular "
+        "${VAR} references in [defaults], environment variables, or --set overrides."
+    )
 
 
 def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 832d93b7..6d83b231 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -17,6 +17,8 @@
 
 DEFAULT_GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
 WORKSPACE_GRID_SEARCH_SCRIPT = "/home/dev/workspace/run_grid_search.py"
+PROCESS_SHUTDOWN_TIMEOUT_SECONDS = 10
+TEE_THREAD_JOIN_TIMEOUT_SECONDS = 5
 
 
 @dataclass(frozen=True)
@@ -172,9 +174,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
         return
 
     available_set = set(available)
-    unavailable = {
-        gpu: names for gpu, names in requested.items() if gpu not in available_set
-    }
+    unavailable = {gpu: names for gpu, names in requested.items() if gpu not in available_set}
     if unavailable:
         details = ", ".join(
             f"GPU {gpu} requested by {', '.join(names)}"
@@ -413,6 +413,7 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
     int
         ``0`` if all jobs exited 0 (or ``dry_run`` was set), ``1`` otherwise.
     """
+    results_dir = results_dir.resolve()
     results_dir.mkdir(parents=True, exist_ok=True)
     invocations = build_invocations(preset, results_dir=results_dir)
     _validate_gpu_assignments(invocations)
@@ -444,23 +445,33 @@ def _terminate_all(jobs: list[_RunningJob]) -> None:
     Parameters
     ----------
     jobs : list of _RunningJob
-        Jobs whose subprocesses should be SIGTERM'd, waited on, and whose tee
-        threads should be joined.
+        Jobs whose subprocesses should be SIGTERM'd, escalated to SIGKILL if
+        needed, and whose tee threads should be joined with bounded waits.
     """
     for j in jobs:
         if j.proc.poll() is None:
             j.proc.terminate()
     for j in jobs:
-        j.proc.wait()
-        j.tee_thread.join()
+        try:
+            j.proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+        except subprocess.TimeoutExpired:
+            j.proc.kill()
+            try:
+                j.proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+            except subprocess.TimeoutExpired:
+                print(
+                    f"[{_ts()}] {j.inv.job.name} did not exit after SIGKILL",
+                    file=sys.stderr,
+                )
+        j.tee_thread.join(timeout=TEE_THREAD_JOIN_TIMEOUT_SECONDS)
 
 
 def _prepare_pixi_env(pixi_env: str) -> None:
     """Prepare a pixi environment before parallel job launch.
 
-    ``pixi run`` is deliberately called once per env even when the interpreter
-    directory already exists, because pixi may still need to materialize PyPI
-    packages into that environment after image startup.
+    Preparation is skipped when a baked interpreter is already available, when
+    prebuilt environments are required, or when ``SAMPLEWORKS_SKIP_ENV_PREPARE``
+    is truthy. Otherwise, ``pixi run`` is called once for the environment.
 
     Parameters
     ----------
@@ -576,24 +587,39 @@ def _spawn(inv: JobInvocation) -> _RunningJob:
     inv.log_path.parent.mkdir(parents=True, exist_ok=True)
     inv.output_dir.mkdir(parents=True, exist_ok=True)
     log_file = open(inv.log_path, "wb")
+    proc: subprocess.Popen[bytes] | None = None
+    thread: threading.Thread | None = None
     try:
         proc = subprocess.Popen(
             inv.argv,
             env=inv.env,
+            cwd=str(_pixi_project_dir()),
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             bufsize=0,
         )
+        if proc.stdout is None:
+            raise RuntimeError(f"Job {inv.job.name!r} started without a stdout pipe")
+        thread = threading.Thread(
+            target=_tee,
+            args=(inv.job.name, proc.stdout, log_file),
+            daemon=True,
+        )
+        thread.start()
     except BaseException:
         log_file.close()
+        if proc is not None and proc.poll() is None:
+            proc.kill()
+            try:
+                proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+            except subprocess.TimeoutExpired:
+                print(
+                    f"[{_ts()}] {inv.job.name} did not exit after failed spawn cleanup",
+                    file=sys.stderr,
+                )
         raise
-    assert proc.stdout is not None
-    thread = threading.Thread(
-        target=_tee,
-        args=(inv.job.name, proc.stdout, log_file),
-        daemon=True,
-    )
-    thread.start()
+    if proc is None or thread is None:
+        raise RuntimeError(f"Job {inv.job.name!r} failed to initialize")
     print(f"[{_ts()}] launched {inv.job.name} (pid {proc.pid})", file=sys.stderr)
     return _RunningJob(inv=inv, proc=proc, tee_thread=thread)
 
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index 451c76b5..37b64e9c 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -1,8 +1,9 @@
 """Dataclasses for the preset schema.
 
 A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
-is launched in its configured model environment with ``CUDA_VISIBLE_DEVICES``
-set to the job's GPU assignment.
+runs in its configured model environment, either through ``pixi run`` or a
+baked environment Python, with ``CUDA_VISIBLE_DEVICES`` set to the job's GPU
+assignment.
 """
 
 from __future__ import annotations
diff --git a/tests/runs/conftest.py b/tests/runs/conftest.py
index a20482c0..70cc7919 100644
--- a/tests/runs/conftest.py
+++ b/tests/runs/conftest.py
@@ -2,10 +2,17 @@
 
 from __future__ import annotations
 
+import os
+
 import pytest
 
 
 @pytest.fixture(autouse=True)
 def force_pixi_argv(monkeypatch: pytest.MonkeyPatch) -> None:
     """Keep argv assertions deterministic on machines with /app/.pixi present."""
+    monkeypatch.delenv("SAMPLEWORKS_GRID_SEARCH_SCRIPT", raising=False)
+    monkeypatch.delenv("SAMPLEWORKS_PIXI_PROJECT_DIR", raising=False)
+    for var in list(os.environ):
+        if var.startswith("SAMPLEWORKS_") and var.endswith("_PYTHON"):
+            monkeypatch.delenv(var, raising=False)
     monkeypatch.setenv("SAMPLEWORKS_FORCE_PIXI", "1")
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index 84b53af6..163fdcd1 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -39,13 +39,15 @@ def test_dry_run_does_not_invoke_subprocess(
 ) -> None:
     """``--dry-run`` prints commands and CUDA assignment instead of executing."""
     monkeypatch.setenv("HOME", str(tmp_path))
-    exit_code = cli.main([
-        "--preset",
-        "rf3_partial",
-        "--dry-run",
-        "--results-dir",
-        str(tmp_path),
-    ])
+    exit_code = cli.main(
+        [
+            "--preset",
+            "rf3_partial",
+            "--dry-run",
+            "--results-dir",
+            str(tmp_path),
+        ]
+    )
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "pixi run -e rf3 python /app/run_grid_search.py" in out
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 0e8d47cb..e8b6cf45 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -183,6 +183,27 @@ def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch)
         loader.load_preset("rf3_partial", overrides=["job.rf3.gpus=0"])
 
 
+def test_set_with_out_of_range_job_index_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Out-of-range list indices in overrides fail with a clear ``KeyError``."""
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(KeyError, match="index 99"):
+        loader.load_preset("rf3_partial", overrides=["jobs.99.gpus=0"])
+
+
+def test_cyclic_variable_expansion_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """Cyclic ``${VAR}`` references fail fast instead of looping forever."""
+    bad = tmp_path / "cycle.toml"
+    bad.write_text(
+        "[shared_args]\n"
+        'proteins = "${A}"\n'
+        '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n'
+    )
+    monkeypatch.setenv("A", "${B}")
+    monkeypatch.setenv("B", "${A}")
+    with pytest.raises(ValueError, match="did not converge"):
+        loader.load_preset(str(bad))
+
+
 def test_bad_env_rejected(tmp_path: Path) -> None:
     """Preset jobs reject unsupported pixi environment names."""
     bad = tmp_path / "bad.toml"

From e99e5f2cfee26c4a54d5f0baf2ca2f1a77ba635e Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 14:16:28 -0400
Subject: [PATCH 23/28] feat(runs): add standalone model presets

---
 README.md                   | 10 +++++++++-
 experiments/boltz.toml      | 30 ++++++++++++++++++++++++++++++
 experiments/boltz1.toml     | 27 +++++++++++++++++++++++++++
 experiments/boltz2.toml     | 30 ++++++++++++++++++++++++++++++
 experiments/boltz2_md.toml  | 26 ++++++++++++++++++++++++++
 experiments/boltz2_xrd.toml | 26 ++++++++++++++++++++++++++
 experiments/protenix.toml   | 25 +++++++++++++++++++++++++
 experiments/rf3.toml        | 27 +++++++++++++++++++++++++++
 src/sampleworks/runs/cli.py | 12 ++++++++----
 tests/runs/test_cli.py      | 34 +++++++++++++++++++++++++++++++++-
 tests/runs/test_loader.py   | 21 +++++++++++++++++----
 11 files changed, 258 insertions(+), 10 deletions(-)
 create mode 100644 experiments/boltz.toml
 create mode 100644 experiments/boltz1.toml
 create mode 100644 experiments/boltz2.toml
 create mode 100644 experiments/boltz2_md.toml
 create mode 100644 experiments/boltz2_xrd.toml
 create mode 100644 experiments/protenix.toml
 create mode 100644 experiments/rf3.toml

diff --git a/README.md b/README.md
index 877c06de..3f107659 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,10 @@ results directory, and MSA cache.
 run_experiments --list        # show available presets
 run_experiments --show rf3    # inspect what will run
 run_experiments --dry-run rf3 # print commands without running
-run_experiments rf3           # run only the RF3 job from full_8gpu
+run_experiments rf3           # run the standalone RF3 preset
+run_experiments boltz         # run Boltz2 X-ray + Boltz2 MD
+run_experiments boltz1        # run standalone Boltz1
+run_experiments protenix      # run the standalone Protenix preset
 run_experiments               # run the default full_8gpu preset
 ```
 
@@ -189,6 +192,11 @@ parallel. Run a subset with:
 run_experiments full_8gpu --jobs rf3,protenix
 ```
 
+Standalone presets are available for each model/model family: `boltz`,
+`boltz1`, `boltz2`, `boltz2_xrd`, `boltz2_md`, `rf3`, and `protenix`.
+Additional comparison presets include `protenix_dual`, `rf3_protenix`, and RF3
+variants.
+
 Presets live in `experiments/*.toml` in your local checkout and on the pod at
 `/home/dev/workspace/experiments/*.toml`. To modify an experiment, edit or copy
 a preset locally, let ACTL sync it, then run it by name or path:
diff --git a/experiments/boltz.toml b/experiments/boltz.toml
new file mode 100644
index 00000000..b7addab7
--- /dev/null
+++ b/experiments/boltz.toml
@@ -0,0 +1,30 @@
+description = "Boltz2 X-ray and MD canonical occ-sweep jobs."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/boltz"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz2_xrd"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz2_xrd"
+args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
+
+[[jobs]]
+name = "boltz2_md"
+env = "boltz"
+gpus = "2,3"
+output_subdir = "boltz2_md"
+args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz1.toml b/experiments/boltz1.toml
new file mode 100644
index 00000000..26df3bf5
--- /dev/null
+++ b/experiments/boltz1.toml
@@ -0,0 +1,27 @@
+description = "Boltz1 canonical occ-sweep job."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/boltz1"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+BOLTZ1_CHECKPOINT = "/checkpoints/boltz1_conf.ckpt"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "boltz1"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5"
+model-checkpoint = "${BOLTZ1_CHECKPOINT}"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz1"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz1"
+args = {}
diff --git a/experiments/boltz2.toml b/experiments/boltz2.toml
new file mode 100644
index 00000000..5f63265a
--- /dev/null
+++ b/experiments/boltz2.toml
@@ -0,0 +1,30 @@
+description = "Boltz2 X-ray and MD canonical occ-sweep jobs."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/boltz2"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz2_xrd"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz2_xrd"
+args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
+
+[[jobs]]
+name = "boltz2_md"
+env = "boltz"
+gpus = "2,3"
+output_subdir = "boltz2_md"
+args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz2_md.toml b/experiments/boltz2_md.toml
new file mode 100644
index 00000000..a175050a
--- /dev/null
+++ b/experiments/boltz2_md.toml
@@ -0,0 +1,26 @@
+description = "Boltz2 MD canonical occ-sweep job."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/boltz2_md"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "boltz2"
+method = "MD"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz2_md"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz2_md"
+args = {}
diff --git a/experiments/boltz2_xrd.toml b/experiments/boltz2_xrd.toml
new file mode 100644
index 00000000..9fc0c48c
--- /dev/null
+++ b/experiments/boltz2_xrd.toml
@@ -0,0 +1,26 @@
+description = "Boltz2 X-ray canonical occ-sweep job."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/boltz2_xrd"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "boltz2"
+method = "X-RAY DIFFRACTION"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "boltz2_xrd"
+env = "boltz"
+gpus = "0,1"
+output_subdir = "boltz2_xrd"
+args = {}
diff --git a/experiments/protenix.toml b/experiments/protenix.toml
new file mode 100644
index 00000000..383daf97
--- /dev/null
+++ b/experiments/protenix.toml
@@ -0,0 +1,25 @@
+description = "Protenix canonical occ-sweep job."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/protenix"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "protenix"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "protenix"
+env = "protenix"
+gpus = "0,1"
+output_subdir = "protenix"
+args = {}
diff --git a/experiments/rf3.toml b/experiments/rf3.toml
new file mode 100644
index 00000000..40a2f559
--- /dev/null
+++ b/experiments/rf3.toml
@@ -0,0 +1,27 @@
+description = "RF3 canonical occ-sweep job."
+
+[defaults]
+DATA_DIR = "/data/inputs"
+RESULTS_DIR = "/data/results/rf3"
+MSA_CACHE_DIR = "/root/.sampleworks"
+PROTEINS_CSV = "${DATA_DIR}/proteins.csv"
+RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt"
+
+[shared_args]
+proteins = "${PROTEINS_CSV}"
+model = "rf3"
+scalers = "pure_guidance"
+partial-diffusion-step = 120
+ensemble-sizes = "8"
+gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1"
+model-checkpoint = "${RF3_CHECKPOINT}"
+gradient-normalization = true
+augmentation = true
+align-to-input = true
+
+[[jobs]]
+name = "rf3"
+env = "rf3"
+gpus = "0,1"
+output_subdir = "rf3"
+args = {}
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 9a5257e0..94055a19 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -67,16 +67,17 @@ def _build_parser() -> argparse.ArgumentParser:
         prog="sampleworks-runs",
         description=(
             "Run Sampleworks experiment presets. With no target, runs the "
-            "full_8gpu preset. A target like 'rf3' or 'rf3,protenix' is a "
-            "job shortcut from full_8gpu; use --preset for another TOML preset."
+            "full_8gpu preset. A target like 'rf3', 'boltz', or 'protenix' "
+            "runs that preset; comma-separated targets like 'rf3,protenix' "
+            "select jobs from full_8gpu."
         ),
     )
     parser.add_argument(
         "target",
         nargs="?",
         help=(
-            "Job shortcut from full_8gpu (rf3, protenix, boltz2_xrd, "
-            "boltz2_md, or comma-separated), or 'full'/'full_8gpu'."
+            "Preset name from experiments/ (rf3, boltz, protenix, etc.), "
+            "comma-separated job shortcut from full_8gpu, or 'full'/'full_8gpu'."
         ),
     )
     parser.add_argument(
@@ -161,6 +162,9 @@ def _resolve_target(
     if target.endswith(".toml") or "/" in target:
         parser.error("pass custom preset paths with --preset path/to/preset.toml")
 
+    if "," not in target and target in loader.list_presets():
+        return target, ""
+
     return DEFAULT_PRESET, target
 
 
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index 163fdcd1..ff21f527 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -14,10 +14,17 @@ def test_list_prints_all_experiment_presets(capsys: pytest.CaptureFixture[str])
     assert exit_code == 0
     out = capsys.readouterr().out.splitlines()
     assert set(out) == {
+        "boltz",
+        "boltz1",
+        "boltz2",
+        "boltz2_md",
+        "boltz2_xrd",
         "full_8gpu",
+        "protenix",
+        "protenix_dual",
+        "rf3",
         "rf3_partial",
         "rf3_partial_chiral_off",
-        "protenix_dual",
         "rf3_protenix",
     }
 
@@ -69,6 +76,31 @@ def test_job_shortcut_filters_default_preset(
     assert "boltz2_md" not in out
 
 
+def test_model_target_uses_named_preset(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """A single model target resolves to the matching standalone preset."""
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(["boltz", "--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: boltz" in out
+    assert "name: boltz2_xrd" in out
+    assert "name: boltz2_md" in out
+
+
+def test_boltz1_target_uses_named_preset(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """The Boltz1 model has its own standalone preset target."""
+    monkeypatch.setenv("HOME", "/home/test")
+    exit_code = cli.main(["boltz1", "--show"])
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "name: boltz1" in out
+    assert "output_subdir: boltz1" in out
+
+
 def test_jobs_filters_explicit_preset(
     monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
 ) -> None:
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index e8b6cf45..2e125879 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -8,10 +8,23 @@
 from sampleworks.runs import loader
 
 
-BUNDLED = ["full_8gpu", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"]
-
-
-def test_list_presets_returns_the_five() -> None:
+BUNDLED = [
+    "boltz",
+    "boltz1",
+    "boltz2",
+    "boltz2_md",
+    "boltz2_xrd",
+    "full_8gpu",
+    "protenix",
+    "protenix_dual",
+    "rf3",
+    "rf3_partial",
+    "rf3_partial_chiral_off",
+    "rf3_protenix",
+]
+
+
+def test_list_presets_returns_bundled_experiments() -> None:
     """Preset discovery returns the expected bundled experiment names."""
     names = loader.list_presets()
     assert set(names) == set(BUNDLED), f"unexpected experiment presets: {names}"

From d86a2997a709c0383a97d33efcf0afe90f7e9e5b Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 14:33:50 -0400
Subject: [PATCH 24/28] feat(runs): auto-assign preset GPUs

---
 README.md                               | 10 ++-
 experiments/boltz.toml                  |  4 +-
 experiments/boltz1.toml                 |  2 +-
 experiments/boltz2.toml                 |  4 +-
 experiments/boltz2_md.toml              |  2 +-
 experiments/boltz2_xrd.toml             |  2 +-
 experiments/full_8gpu.toml              |  8 +-
 experiments/protenix.toml               |  2 +-
 experiments/protenix_dual.toml          |  4 +-
 experiments/rf3.toml                    |  2 +-
 experiments/rf3_partial.toml            |  2 +-
 experiments/rf3_partial_chiral_off.toml |  2 +-
 experiments/rf3_protenix.toml           |  4 +-
 src/sampleworks/runs/cli.py             |  7 +-
 src/sampleworks/runs/loader.py          |  8 +-
 src/sampleworks/runs/runner.py          | 99 +++++++++++++++++++++----
 src/sampleworks/runs/schema.py          | 24 +++---
 tests/runs/test_cli.py                  |  5 +-
 tests/runs/test_runner.py               | 68 ++++++++++++++++-
 19 files changed, 209 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 3f107659..2142338a 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,8 @@ run_experiments full_8gpu --jobs rf3,protenix
 Standalone presets are available for each model/model family: `boltz`,
 `boltz1`, `boltz2`, `boltz2_xrd`, `boltz2_md`, `rf3`, and `protenix`.
 Additional comparison presets include `protenix_dual`, `rf3_protenix`, and RF3
-variants.
+variants. Single-job presets default to `gpu_count = 8`, so on an 8-GPU pod
+they use the whole machine.
 
 Presets live in `experiments/*.toml` in your local checkout and on the pod at
 `/home/dev/workspace/experiments/*.toml`. To modify an experiment, edit or copy
@@ -210,10 +211,15 @@ run_experiments --preset my_rf3
 For one-off changes, use `--set` instead of editing TOML:
 
 ```bash
-run_experiments rf3 --set jobs.rf3.gpus=0,1
+run_experiments rf3 --set jobs.rf3.gpu_count=4
 run_experiments rf3 --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
 ```
 
+Presets usually declare `gpu_count = N`, not fixed GPU IDs. The runner assigns
+visible GPUs automatically in job order, so the same preset runs on any pod
+with enough visible GPUs (it errors out otherwise). Use explicit `gpus = "0,1"`
+only when you need to pin a job to specific devices.
+
 Defaults: inputs come from `/mnt/diffuse-shared/raw/sampleworks/...`, checkpoints
 from `/mnt/diffuse-shared/raw/checkpoints`, results go to
 `/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`, and MSA caches go to
diff --git a/experiments/boltz.toml b/experiments/boltz.toml
index b7addab7..6dde9089 100644
--- a/experiments/boltz.toml
+++ b/experiments/boltz.toml
@@ -18,13 +18,13 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 4
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz1.toml b/experiments/boltz1.toml
index 26df3bf5..f1a8a2cf 100644
--- a/experiments/boltz1.toml
+++ b/experiments/boltz1.toml
@@ -22,6 +22,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz1"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz1"
 args = {}
diff --git a/experiments/boltz2.toml b/experiments/boltz2.toml
index 5f63265a..551ca0d5 100644
--- a/experiments/boltz2.toml
+++ b/experiments/boltz2.toml
@@ -18,13 +18,13 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 4
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz2_md.toml b/experiments/boltz2_md.toml
index a175050a..0db59a82 100644
--- a/experiments/boltz2_md.toml
+++ b/experiments/boltz2_md.toml
@@ -21,6 +21,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz2_md"
 args = {}
diff --git a/experiments/boltz2_xrd.toml b/experiments/boltz2_xrd.toml
index 9fc0c48c..10bdf3fb 100644
--- a/experiments/boltz2_xrd.toml
+++ b/experiments/boltz2_xrd.toml
@@ -21,6 +21,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz2_xrd"
 args = {}
diff --git a/experiments/full_8gpu.toml b/experiments/full_8gpu.toml
index fb700177..62c1cb06 100644
--- a/experiments/full_8gpu.toml
+++ b/experiments/full_8gpu.toml
@@ -18,27 +18,27 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 2
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 2
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "4,5"
+gpu_count = 2
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1" }
 
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "6,7"
+gpu_count = 2
 output_subdir = "protenix"
 args = { model = "protenix", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/protenix.toml b/experiments/protenix.toml
index 383daf97..1c94a364 100644
--- a/experiments/protenix.toml
+++ b/experiments/protenix.toml
@@ -20,6 +20,6 @@ align-to-input = true
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "protenix"
 args = {}
diff --git a/experiments/protenix_dual.toml b/experiments/protenix_dual.toml
index 231ee220..50f61e8b 100644
--- a/experiments/protenix_dual.toml
+++ b/experiments/protenix_dual.toml
@@ -22,13 +22,13 @@ align-to-input = true
 [[jobs]]
 name = "protenix_tiny"
 env = "protenix"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "protenix_tiny"
 args = { model-checkpoint = "${PROTENIX_TINY_CHECKPOINT}" }
 
 [[jobs]]
 name = "protenix_mini"
 env = "protenix"
-gpus = "6,7"
+gpu_count = 4
 output_subdir = "protenix_mini"
 args = { model-checkpoint = "${PROTENIX_MINI_CHECKPOINT}" }
diff --git a/experiments/rf3.toml b/experiments/rf3.toml
index 40a2f559..4e63c128 100644
--- a/experiments/rf3.toml
+++ b/experiments/rf3.toml
@@ -22,6 +22,6 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "rf3"
 args = {}
diff --git a/experiments/rf3_partial.toml b/experiments/rf3_partial.toml
index 60a063e1..7937e9a6 100644
--- a/experiments/rf3_partial.toml
+++ b/experiments/rf3_partial.toml
@@ -19,6 +19,6 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "4"
+gpu_count = 8
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/experiments/rf3_partial_chiral_off.toml b/experiments/rf3_partial_chiral_off.toml
index af0e5ac8..562ae2e7 100644
--- a/experiments/rf3_partial_chiral_off.toml
+++ b/experiments/rf3_partial_chiral_off.toml
@@ -21,6 +21,6 @@ disable-chiral-features = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "5"
+gpu_count = 8
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/experiments/rf3_protenix.toml b/experiments/rf3_protenix.toml
index 4ca5638d..7b9996cb 100644
--- a/experiments/rf3_protenix.toml
+++ b/experiments/rf3_protenix.toml
@@ -17,13 +17,13 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "0,1,2,3"
+gpu_count = 4
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.01 0.02 0.05 0.1" }
 
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "4,5,6,7"
+gpu_count = 4
 output_subdir = "protenix"
 args = { model = "protenix", partial-diffusion-step = 120, gradient-weights = "0.0 0.1 0.2 0.5" }
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
index 94055a19..008d3f99 100644
--- a/src/sampleworks/runs/cli.py
+++ b/src/sampleworks/runs/cli.py
@@ -110,7 +110,7 @@ def _build_parser() -> argparse.ArgumentParser:
             "Override a value in the loaded preset. Examples: "
             "--set defaults.DATA_DIR=/data/foo, "
             "--set jobs.rf3.args.gradient-weights='0.0 0.01', "
-            "--set jobs.0.gpus=5"
+            "--set jobs.0.gpu_count=4"
         ),
     )
     parser.add_argument(
@@ -224,7 +224,10 @@ def _print_show(preset: Preset) -> None:
     for j in preset.jobs:
         print(f"  - name: {j.name}")
         print(f"    env: {j.env}")
-        print(f"    gpus: {j.gpus}")
+        if j.gpus:
+            print(f"    gpus: {j.gpus}")
+        else:
+            print(f"    gpu_count: {j.gpu_count}")
         print(f"    output_subdir: {j.output_subdir}")
         print("    args:")
         for k, v in j.args.items():
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index d5f88e4d..e8929263 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -501,8 +501,9 @@ def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
         Job(
             name=str(j["name"]),
             env=str(j["env"]),
-            gpus=str(j["gpus"]),
             output_subdir=str(j["output_subdir"]),
+            gpus=str(j.get("gpus", "")),
+            gpu_count=_optional_int(j.get("gpu_count")),
             args=dict(j.get("args", {})),
         )
         for j in raw_jobs
@@ -514,3 +515,8 @@ def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
         shared_args=dict(raw.get("shared_args", {})),
         jobs=jobs,
     )
+
+
+def _optional_int(value: Any) -> int | None:
+    """Return ``value`` as an int, preserving ``None`` for absent fields."""
+    return None if value is None else int(value)
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 6d83b231..1442adf9 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -34,6 +34,9 @@ class JobInvocation:
         by ``run_grid_search.py``.
     env : dict of str to str
         Process environment, including ``CUDA_VISIBLE_DEVICES``.
+    gpus : str
+        Resolved CUDA-visible GPU assignment. For jobs that declare
+        ``gpu_count``, this is the concrete auto-assigned GPU list.
     log_path : Path
         File to tee stdout+stderr into.
     output_dir : Path
@@ -44,6 +47,7 @@ class JobInvocation:
     job: Job
     argv: list[str]
     env: dict[str, str]
+    gpus: str
     log_path: Path
     output_dir: Path
 
@@ -67,20 +71,82 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
     list of JobInvocation
         One :class:`JobInvocation` per job, in declaration order.
     """
+    gpu_assignments = _resolve_gpu_assignments(preset.jobs)
     invocations: list[JobInvocation] = []
     for job in preset.jobs:
         args = preset.effective_args(job)
         args.setdefault("output-dir", str(results_dir / job.output_subdir))
         argv = _build_argv(job.env, args)
-        env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus})
+        gpus = gpu_assignments[job.name]
+        env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": gpus})
         log_path = results_dir / f"{job.name}_run.log"
         output_dir = Path(args["output-dir"])
         invocations.append(
-            JobInvocation(job=job, argv=argv, env=env, log_path=log_path, output_dir=output_dir)
+            JobInvocation(
+                job=job,
+                argv=argv,
+                env=env,
+                gpus=gpus,
+                log_path=log_path,
+                output_dir=output_dir,
+            )
         )
     return invocations
 
 
+def _resolve_gpu_assignments(jobs: list[Job]) -> dict[str, str]:
+    """Resolve explicit ``gpus`` and automatic ``gpu_count`` declarations.
+
+    Explicit ``gpus`` are reserved and win over ``gpu_count`` on the same job.
+    Auto jobs take the remaining visible GPU IDs in declaration order and a
+    ``RuntimeError`` is raised when too few are visible. Without GPU discovery
+    (local dry-runs/tests), synthetic ordinals keep argv deterministic.
+    """
+    # Skip pinned jobs in the total: the loop below never gives them pool GPUs.
+    reserved = {gpu for job in jobs if job.gpus for gpu in _split_gpu_list(job.gpus)}
+    total_auto = sum(job.gpu_count or 0 for job in jobs if not job.gpus)
+    available = _detect_available_gpus()
+    if available:
+        pool = [gpu for gpu in available if gpu not in reserved]
+        if len(pool) < total_auto:
+            raise RuntimeError(
+                "Not enough visible GPUs for preset auto-assignment. "
+                f"Visible GPUs: {available}. Reserved GPUs: {sorted(reserved)}. "
+                f"Auto-requested GPUs: {total_auto}."
+            )
+    elif _cuda_visible_devices_disables_gpus() and total_auto:
+        raise RuntimeError(
+            "CUDA_VISIBLE_DEVICES disables GPU access, so gpu_count auto-assignment "
+            "cannot allocate any GPUs."
+        )
+    else:
+        pool = _synthetic_gpu_pool(reserved, total_auto)
+
+    assignments: dict[str, str] = {}
+    cursor = 0
+    for job in jobs:
+        if job.gpus:
+            assignments[job.name] = job.gpus
+            continue
+        count = job.gpu_count or 0
+        assigned = pool[cursor : cursor + count]
+        cursor += count
+        assignments[job.name] = ",".join(assigned)
+    return assignments
+
+
+def _synthetic_gpu_pool(reserved: set[str], count: int) -> list[str]:
+    """Return deterministic CUDA ordinals when real GPU discovery is unavailable."""
+    pool: list[str] = []
+    candidate = 0
+    while len(pool) < count:
+        token = str(candidate)
+        if token not in reserved:
+            pool.append(token)
+        candidate += 1
+    return pool
+
+
 def _split_gpu_list(value: str) -> list[str]:
     """Split a comma-separated GPU assignment into normalized tokens.
 
@@ -123,12 +189,10 @@ def _detect_available_gpus() -> list[str]:
         available. Empty means validation should be skipped.
     """
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
-    if cuda_visible and cuda_visible.lower() not in {
-        "all",
-        "none",
-        "void",
-        "nodevfiles",
-    }:
+    cuda_visible_key = cuda_visible.lower()
+    if _cuda_visible_devices_disables_gpus():
+        return []
+    if cuda_visible and cuda_visible_key != "all":
         return _split_gpu_list(cuda_visible)
 
     try:
@@ -145,6 +209,15 @@ def _detect_available_gpus() -> list[str]:
     return [line.strip() for line in result.stdout.splitlines() if line.strip()]
 
 
+def _cuda_visible_devices_disables_gpus() -> bool:
+    """Return True when CUDA_VISIBLE_DEVICES explicitly hides all GPUs."""
+    return os.environ.get("CUDA_VISIBLE_DEVICES", "").strip().lower() in {
+        "none",
+        "void",
+        "nodevfiles",
+    }
+
+
 def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
     """Fail fast when a preset asks for GPUs not present in this pod.
 
@@ -166,7 +239,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
 
     requested: dict[str, list[str]] = {}
     for inv in invocations:
-        for gpu in _split_gpu_list(inv.job.gpus):
+        for gpu in _split_gpu_list(inv.gpus):
             requested.setdefault(gpu, []).append(inv.job.name)
 
     requested_tokens = list(requested)
@@ -183,7 +256,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
         raise RuntimeError(
             "Preset requests GPUs that are not visible in this pod. "
             f"Visible GPUs: {', '.join(available)}. {details}. "
-            "Edit the preset's jobs.*.gpus values or run a smaller --jobs subset."
+            "Edit the preset's jobs.*.gpus/gpu_count values or run a smaller --jobs subset."
         )
 
     allow_oversubscription = os.environ.get(
@@ -517,9 +590,9 @@ def _print_dry_run(inv: JobInvocation) -> None:
     inv : JobInvocation
         Invocation to print.
     """
-    print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
+    print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.gpus})", file=sys.stderr)
     print(f"# log: {inv.log_path}", file=sys.stderr)
-    print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}")
+    print(f"CUDA_VISIBLE_DEVICES={inv.gpus} {_shell_join(inv.argv)}")
     print(file=sys.stderr)
 
 
@@ -540,7 +613,7 @@ def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> N
         print(f"  {preset.description}", file=sys.stderr)
     for inv in invocations:
         print(
-            f"  - {inv.job.name}: env={inv.job.env}, gpus={inv.job.gpus}, log={inv.log_path}",
+            f"  - {inv.job.name}: env={inv.job.env}, gpus={inv.gpus}, log={inv.log_path}",
             file=sys.stderr,
         )
     print(bar, file=sys.stderr)
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
index 37b64e9c..9cb99fac 100644
--- a/src/sampleworks/runs/schema.py
+++ b/src/sampleworks/runs/schema.py
@@ -2,8 +2,8 @@
 
 A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
 runs in its configured model environment, either through ``pixi run`` or a
-baked environment Python, with ``CUDA_VISIBLE_DEVICES`` set to the job's GPU
-assignment.
+baked environment Python, with ``CUDA_VISIBLE_DEVICES`` set from an explicit
+GPU assignment or an automatically allocated ``gpu_count``.
 """
 
 from __future__ import annotations
@@ -28,8 +28,11 @@ class Job:
         Pixi environment to run the job in. Must be one of
         :data:`VALID_PIXI_ENVS`.
     gpus : str
-        Value to set as ``CUDA_VISIBLE_DEVICES`` for the subprocess (e.g.
-        ``"4"`` or ``"0,1"``).
+        Explicit value to set as ``CUDA_VISIBLE_DEVICES`` for the subprocess
+        (e.g. ``"4"`` or ``"0,1"``). Mutually exclusive with ``gpu_count``.
+    gpu_count : int or None, optional
+        Number of visible GPUs to auto-assign for this job. The runner assigns
+        concrete GPU IDs in declaration order.
     output_subdir : str
         Path appended to the run's ``results_dir`` to form the job's
         ``--output-dir`` argument, when one is not given explicitly in ``args``.
@@ -42,14 +45,15 @@ class Job:
     Raises
     ------
     ValueError
-        If ``env`` is not in :data:`VALID_PIXI_ENVS`, or if ``gpus`` /
-        ``output_subdir`` is empty.
+        If ``env`` is not in :data:`VALID_PIXI_ENVS`, if neither/both ``gpus``
+        and ``gpu_count`` are set, or if ``output_subdir`` is empty.
     """
 
     name: str
     env: str
-    gpus: str
     output_subdir: str
+    gpus: str = ""
+    gpu_count: int | None = None
     args: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
@@ -58,8 +62,10 @@ def __post_init__(self) -> None:
             raise ValueError(
                 f"Job {self.name!r}: env must be one of {VALID_PIXI_ENVS}, got {self.env!r}"
             )
-        if not self.gpus:
-            raise ValueError(f"Job {self.name!r}: gpus must be non-empty")
+        if bool(self.gpus) == (self.gpu_count is not None):
+            raise ValueError(f"Job {self.name!r}: set exactly one of gpus or gpu_count")
+        if self.gpu_count is not None and self.gpu_count <= 0:
+            raise ValueError(f"Job {self.name!r}: gpu_count must be positive")
         if not self.output_subdir:
             raise ValueError(f"Job {self.name!r}: output_subdir must be non-empty")
 
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
index ff21f527..7af38081 100644
--- a/tests/runs/test_cli.py
+++ b/tests/runs/test_cli.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 
 import pytest
-from sampleworks.runs import cli
+from sampleworks.runs import cli, runner
 
 
 def test_list_prints_all_experiment_presets(capsys: pytest.CaptureFixture[str]) -> None:
@@ -46,6 +46,7 @@ def test_dry_run_does_not_invoke_subprocess(
 ) -> None:
     """``--dry-run`` prints commands and CUDA assignment instead of executing."""
     monkeypatch.setenv("HOME", str(tmp_path))
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: [str(i) for i in range(8)])
     exit_code = cli.main(
         [
             "--preset",
@@ -58,7 +59,7 @@ def test_dry_run_does_not_invoke_subprocess(
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "pixi run -e rf3 python /app/run_grid_search.py" in out
-    assert "CUDA_VISIBLE_DEVICES=4" in out
+    assert "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" in out
 
 
 def test_job_shortcut_filters_default_preset(
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py
index 5eec8946..04a4b1b0 100644
--- a/tests/runs/test_runner.py
+++ b/tests/runs/test_runner.py
@@ -9,17 +9,18 @@
 
 
 def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Faithful translation: argv should match the canonical rf3_partial bash invocation."""
+    """RF3 partial builds the canonical argv and auto-assigns all GPUs."""
     monkeypatch.setenv("HOME", "/home/test")
     monkeypatch.delenv("DATA_DIR", raising=False)
     monkeypatch.delenv("RESULTS_DIR", raising=False)
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: [str(i) for i in range(8)])
     preset = loader.load_preset("rf3_partial")
     invocations = runner.build_invocations(preset, results_dir=Path("/results"))
 
     assert len(invocations) == 1
     inv = invocations[0]
     assert inv.job.name == "rf3"
-    assert inv.env["CUDA_VISIBLE_DEVICES"] == "4"
+    assert inv.env["CUDA_VISIBLE_DEVICES"] == "0,1,2,3,4,5,6,7"
     assert inv.log_path == Path("/results/rf3_run.log")
 
     argv = inv.argv
@@ -71,6 +72,7 @@ def test_full_8gpu_has_four_jobs_with_distinct_gpus(
 ) -> None:
     """The full_8gpu preset maps its four jobs onto distinct GPU pairs."""
     monkeypatch.setenv("HOME", "/home/test")
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: [str(i) for i in range(8)])
     preset = loader.load_preset("full_8gpu")
     invocations = runner.build_invocations(preset, results_dir=Path("/r"))
     assert [i.job.name for i in invocations] == ["boltz2_xrd", "boltz2_md", "rf3", "protenix"]
@@ -78,6 +80,68 @@ def test_full_8gpu_has_four_jobs_with_distinct_gpus(
     assert gpu_assignments == ["0,1", "2,3", "4,5", "6,7"]
 
 
+def test_single_job_presets_use_all_eight_gpus(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Every standalone single-job preset is handed GPUs 0-7 when eight are visible."""
+    expected = "0,1,2,3,4,5,6,7"
+    monkeypatch.setenv("HOME", "/home/test")
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: expected.split(","))
+    for preset_name in ("boltz1", "boltz2_xrd", "boltz2_md", "rf3", "protenix"):
+        loaded = loader.load_preset(preset_name)
+        built = runner.build_invocations(loaded, results_dir=Path("/r"))
+        assert len(built) == 1
+        assert built[0].env["CUDA_VISIBLE_DEVICES"] == expected
+
+
+def test_gpu_count_uses_visible_gpus_in_order(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Jobs declared with ``gpu_count`` draw visible GPU IDs in declaration order."""
+    preset_file = tmp_path / "custom.toml"
+    preset_file.write_text(
+        "[shared_args]\n"
+        '[[jobs]]\nname = "a"\nenv = "rf3"\ngpu_count = 2\noutput_subdir = "a"\n'
+        '[[jobs]]\nname = "b"\nenv = "rf3"\ngpu_count = 1\noutput_subdir = "b"\n'
+    )
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: ["4", "5", "6"])
+    loaded = loader.load_preset(str(preset_file))
+    built = runner.build_invocations(loaded, results_dir=tmp_path / "results")
+
+    assert [inv.gpus for inv in built] == ["4,5", "6"]
+    assert [inv.env["CUDA_VISIBLE_DEVICES"] for inv in built] == ["4,5", "6"]
+
+
+def test_gpu_count_respects_explicit_gpu_reservations(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """A ``gpu_count`` job is assigned only GPUs that no explicit ``gpus`` job claimed."""
+    preset_file = tmp_path / "custom.toml"
+    preset_file.write_text(
+        "[shared_args]\n"
+        '[[jobs]]\nname = "manual"\nenv = "rf3"\ngpus = "2"\noutput_subdir = "manual"\n'
+        '[[jobs]]\nname = "auto"\nenv = "rf3"\ngpu_count = 2\noutput_subdir = "auto"\n'
+    )
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: ["0", "1", "2", "3"])
+    loaded = loader.load_preset(str(preset_file))
+    built = runner.build_invocations(loaded, results_dir=tmp_path / "results")
+
+    assert [inv.gpus for inv in built] == ["2", "0,1"]
+
+
+def test_gpu_count_rejects_insufficient_visible_gpus(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Requesting two GPUs while only one is visible raises a ``RuntimeError``."""
+    preset_file = tmp_path / "custom.toml"
+    preset_file.write_text(
+        '[shared_args]\n[[jobs]]\nname = "a"\nenv = "rf3"\ngpu_count = 2\noutput_subdir = "a"\n'
+    )
+    monkeypatch.setattr(runner, "_detect_available_gpus", lambda: ["0"])
+    loaded = loader.load_preset(str(preset_file))
+
+    with pytest.raises(RuntimeError, match="Not enough visible GPUs"):
+        runner.build_invocations(loaded, results_dir=tmp_path / "results")
+
+
 def test_protenix_dual_uses_different_checkpoints(monkeypatch: pytest.MonkeyPatch) -> None:
     """The Protenix dual preset uses separate tiny and mini checkpoints."""
     monkeypatch.setenv("HOME", "/home/test")

From b508c511fd32783aa0ef5169d24577a3cb896171 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Sat, 23 May 2026 17:24:27 -0400
Subject: [PATCH 25/28] fix(runs): clear GPU override counterpart

---
 src/sampleworks/runs/loader.py |  4 ++++
 tests/runs/test_loader.py      | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
index e8929263..fcec6441 100644
--- a/src/sampleworks/runs/loader.py
+++ b/src/sampleworks/runs/loader.py
@@ -274,6 +274,10 @@ def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
         leaf_parent[_find_in_list(leaf_parent, leaf_key, where=dotted)] = value
     else:
         leaf_parent[leaf_key] = value
+        if parts[0] == "jobs" and len(parts) == 3 and leaf_key == "gpus":
+            leaf_parent.pop("gpu_count", None)
+        elif parts[0] == "jobs" and len(parts) == 3 and leaf_key == "gpu_count":
+            leaf_parent.pop("gpus", None)
 
 
 def _index(cursor: Any, part: str, *, where: str) -> Any:
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
index 2e125879..0d3dd672 100644
--- a/tests/runs/test_loader.py
+++ b/tests/runs/test_loader.py
@@ -83,6 +83,7 @@ def test_set_override_at_job_by_name(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("full_8gpu", overrides=["jobs.rf3.gpus=7"])
     assert preset.job("rf3").gpus == "7"
+    assert preset.job("rf3").gpu_count is None
 
 
 def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -90,6 +91,27 @@ def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("HOME", "/home/test")
     preset = loader.load_preset("full_8gpu", overrides=["jobs.0.gpus=9"])
     assert preset.jobs[0].gpus == "9"
+    assert preset.jobs[0].gpu_count is None
+
+
+def test_set_override_gpu_count_clears_gpus(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Overriding ``jobs.<name>.gpu_count`` drops the job's explicit ``gpus`` value."""
+    monkeypatch.setenv("HOME", "/home/test")
+    preset_file = tmp_path / "gpu_count.toml"
+    preset_file.write_text(
+        'description = "custom"\n'
+        "[[jobs]]\n"
+        'name = "j1"\n'
+        'env = "rf3"\n'
+        'gpus = "0"\n'
+        'output_subdir = "j1"\n'
+        "args = {}\n"
+    )
+    job = loader.load_preset(str(preset_file), overrides=["jobs.j1.gpu_count=2"]).job("j1")
+    assert job.gpu_count == 2
+    assert job.gpus == ""
 
 
 def test_set_override_at_args_inside_job(monkeypatch: pytest.MonkeyPatch) -> None:

From 797600c19900b59b5effe7c6c61260655c1f3671 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Mon, 25 May 2026 14:26:03 -0400
Subject: [PATCH 26/28] ci(docker): tag Harbor sampleworks image

---
 .github/workflows/docker.yml | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index d5123f31..9e5350c4 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -1,7 +1,7 @@
 name: Build and Push Docker Images
 
-# CI builds pull all model checkpoints (~10 GB) from Docker Hub automatically via:
-#   COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/
+# CI builds pull all model checkpoints (~10 GB) from Harbor automatically via:
+#   COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/
 # No checkpoint files are needed on the CI runner. The checkpoints base image is
 # pre-built and pushed from the GPU server. See Dockerfile comments for details.
 
@@ -13,8 +13,8 @@ on:
   workflow_dispatch:
 
 env:
-  DOCKERHUB_ORG: diffuseproject
-  IMAGE_NAME: sampleworks
+  REGISTRY: harbor.astera.sh
+  IMAGE_NAME: library/pixi-with-checkpoints
 
 jobs:
   build:
@@ -39,24 +39,25 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v4
 
-      - name: Login to Docker Hub
+      - name: Login to Harbor
         uses: docker/login-action@v4
         with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.HARBOR_USERNAME }}
+          password: ${{ secrets.HARBOR_PASSWORD }}
 
-      # The Dockerfile uses COPY --from=diffuseproject/sampleworks-checkpoints:latest
-      # which Docker automatically pulls from Docker Hub during the build.
+      # The Dockerfile uses COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest
+      # which Docker automatically pulls from Harbor during the build.
       # No checkpoint files are needed in the CI build context.
 
       - name: Docker metadata
         id: meta
         uses: docker/metadata-action@v6
         with:
-          images: ${{ env.DOCKERHUB_ORG }}/${{ env.IMAGE_NAME }}
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
           tags: |
-            type=raw,value=latest
-            type=sha,prefix=
+            type=raw,value=sampleworks
+            type=sha,prefix=sha-
             type=semver,pattern={{version}}
             type=semver,pattern=v{{version}}
       - name: Build and push Docker image

From 866c4dac06178be307db086a7204adce2618ef18 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Mon, 25 May 2026 19:18:36 -0400
Subject: [PATCH 27/28] fix(runs): address ACTL review feedback

---
 README.md                               |  37 ++++--
 run_experiments                         | 121 +++++++++++++++++--
 run_grid_search.py                      | 149 +++++++++++++++++-------
 src/sampleworks/models/boltz/wrapper.py |   6 +-
 4 files changed, 252 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index 2142338a..082443f4 100644
--- a/README.md
+++ b/README.md
@@ -154,11 +154,17 @@ Instructions for running evaluation and metrics scripts are coming soon.
 
 ## Running preset experiments on ACTL (`run_experiments`)
 
+This section is Astera-specific: it assumes access to ACTL, the internal Harbor
+image registry, and the `diffuse-shared` PVC. External users can run the same
+TOML presets with `sampleworks-runs` or `python -m sampleworks.runs.cli` after
+setting equivalent local paths for `DATA_DIR`, `PROTEINS_CSV`, `RESULTS_DIR`,
+`MSA_CACHE_DIR`, and model checkpoints.
+
 Start an 8-GPU ACTL machine named `sampleworks` with the Sampleworks image and
 the shared data volume mounted:
 
 ```bash
-actl pod up sampleworks --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:latest --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
+actl pod up sampleworks --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes
 ```
 
 Keep that terminal open; it maintains sync and SSH. From another terminal:
@@ -175,14 +181,18 @@ right `run_grid_search.py` jobs, pixi environments, GPU assignments, logs,
 results directory, and MSA cache.
 
 ```bash
-run_experiments --list        # show available presets
+export DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps
+export PROTEINS_CSV="$DATA_DIR/proteins.csv"
+export SAMPLEWORKS_ACTL_RUN_NAME="$(hostname -s)"
+
+run_experiments --list        # show available presets (does not require DATA_DIR)
 run_experiments --show rf3    # inspect what will run
 run_experiments --dry-run rf3 # print commands without running
 run_experiments rf3           # run the standalone RF3 preset
 run_experiments boltz         # run Boltz2 X-ray + Boltz2 MD
 run_experiments boltz1        # run standalone Boltz1
 run_experiments protenix      # run the standalone Protenix preset
-run_experiments               # run the default full_8gpu preset
+run_experiments full_8gpu     # run the full 8-GPU comparison preset
 ```
 
-The default `full_8gpu` preset runs Boltz2 XRD, Boltz2 MD, RF3, and Protenix in
+The `full_8gpu` preset runs Boltz2 XRD, Boltz2 MD, RF3, and Protenix in
@@ -217,14 +227,23 @@ run_experiments rf3 --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
 
 Presets usually declare `gpu_count = N`, not fixed GPU IDs. The runner assigns
 visible GPUs automatically in job order, so the same preset works on different
-pod sizes. Use explicit `gpus = "0,1"` only when you need to pin a job to
-specific devices.
+pod sizes and fails fast if the pod has fewer visible GPUs than requested. Use
+explicit `gpus = "0,1"` only when you need to pin a job to specific devices; the
+runner validates those IDs before launching jobs.
 
-Defaults: inputs come from `/mnt/diffuse-shared/raw/sampleworks/...`, checkpoints
-from `/mnt/diffuse-shared/raw/checkpoints`, results go to
+Set `DATA_DIR` and `PROTEINS_CSV` explicitly for each run so they are captured in
+the shell history and launch logs. Checkpoints default to
+`/mnt/diffuse-shared/raw/checkpoints` when those files exist, results go to
 `/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`, and MSA caches go to
-`/mnt/diffuse-shared/cache/sampleworks/msa`. Override with `DATA_DIR`,
-`RESULTS_DIR`, or `MSA_CACHE_DIR` before running.
+`/mnt/diffuse-shared/cache/sampleworks/msa`. Override with `RESULTS_DIR`,
+`MSA_CACHE_DIR`, or model-specific checkpoint variables before running.
+
+The ACTL image contains baked pixi environments under `/app/.pixi`. If your
+synced branch changes `pyproject.toml` or `pixi.lock`, `run_experiments` stops
+with a clear error instead of mutating the baked environment. For dependency
+debugging only, opt into an on-pod pixi update with
+`SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 run_experiments ...`; reproducible scientist
+runs should use a rebuilt `pixi-with-checkpoints:sampleworks` image instead.
 
 
 ## Docker
diff --git a/run_experiments b/run_experiments
index 84e831e5..eca31bce 100755
--- a/run_experiments
+++ b/run_experiments
@@ -7,6 +7,9 @@
 
 set -euo pipefail
 
+# Resolve the physical location of this wrapper even when it is invoked through
+# the /usr/local/bin symlink baked into the ACTL image. Later fallbacks use this
+# directory to find a checkout when /home/dev/workspace is not available.
 script_path="${BASH_SOURCE[0]}"
 while [[ -L "$script_path" ]]; do
     script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)"
@@ -36,6 +39,37 @@ find_sampleworks_root_upwards() {
     return 1
 }
 
+truthy_env() {
+    local name="$1"  # name of the variable to test; read via indirect expansion
+    [[ "${!name:-}" =~ ^(1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss])$ ]]  # 1/true/yes, any letter case
+}
+
+require_env_var() {
+    # $1: name of the variable to check; $2: guidance printed when it is missing.
+    local name="$1" help_text="$2"
+    # Guard clause: a set, non-empty variable needs no report.
+    [[ -n "${!name:-}" ]] && return 0
+    cat >&2 <<EOF
+$name must be set explicitly for run_experiments.
+
+$help_text
+EOF
+    return 2
+}
+
+pixi_inputs_match_image() {
+    # $1: root of the baked image project; $2: root of the synced checkout.
+    local baked="$1" synced="$2" manifest
+    # If either side lacks pyproject.toml or pixi.lock there is nothing to
+    # compare, so report a match and leave it to the prebuilt-env checks.
+    for manifest in pyproject.toml pixi.lock; do
+        [[ -f "$baked/$manifest" && -f "$synced/$manifest" ]] || return 0
+    done
+    for manifest in pyproject.toml pixi.lock; do
+        cmp -s "$baked/$manifest" "$synced/$manifest" || return
+    done
+}
+
 resolve_repo_root() {
     local source_override="${SAMPLEWORKS_SOURCE_DIR:-}"
     if [[ -n "$source_override" ]]; then
@@ -88,7 +122,7 @@ EOF
 repo_root="$(resolve_repo_root)"
 
 env_preset="${SAMPLEWORKS_PRESET:-}"
-default_target="${env_preset:-full_8gpu}"
+default_target="$env_preset"
 target=""
 explicit_preset=""
 explicit_jobs=""
@@ -140,6 +174,33 @@ for arg in "$@"; do
     esac
 done
 
+needs_run_config=1
+for arg in "$@"; do
+    case "$arg" in
+        --list|-h|--help)
+            needs_run_config=0
+            ;;
+    esac
+done
+
+if [[
+    "$needs_run_config" -eq 1 &&
+    -z "$target" &&
+    -z "$explicit_preset" &&
+    -z "$explicit_jobs" &&
+    -z "$env_preset"
+]]; then
+    cat >&2 <<'EOF'
+run_experiments requires an explicit preset or job selector.
+
+Examples:
+  run_experiments rf3
+  run_experiments full_8gpu --jobs rf3,protenix
+  run_experiments --preset experiments/my_rf3.toml
+EOF
+    exit 2
+fi
+
 label_source="$default_target"
 if [[ -n "$explicit_preset" ]]; then
     label_source="$explicit_preset"
@@ -163,11 +224,17 @@ run_label="${run_label%.toml}"
 run_label="${run_label//,/_}"
 
 run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}"
-default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps"
 default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${run_label}"
 default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa"
 
-export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}"
+export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-}}"
+export PROTEINS_CSV="${PROTEINS_CSV:-${SAMPLEWORKS_PROTEINS_CSV:-}}"
+if [[ "$needs_run_config" -eq 1 ]]; then
+    require_env_var DATA_DIR \
+        "Set DATA_DIR to the dataset directory for this run, e.g. /mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps."
+    require_env_var PROTEINS_CSV \
+        "Set PROTEINS_CSV to the input manifest for this run, e.g. \$DATA_DIR/proteins.csv."
+fi
 if [[ -n "$explicit_results_dir" ]]; then
     export RESULTS_DIR="$explicit_results_dir"
 else
@@ -175,15 +242,11 @@ else
 fi
 export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}"
 export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}"
+# Append the old PYTHONPATH only when it is non-empty. This avoids a trailing
+# colon, which Python treats as the current working directory.
 export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}"
 export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}"
 export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}"
-# The ACTL image is expected to provide ready-to-use pixi envs under /app/.pixi.
-# Do not let sampleworks.runs.runner call `pixi run` just to "prepare" envs;
-# that can reinstall the CUDA stack inside the pod, especially when the synced
-# workspace source is older than this wrapper.
-export SAMPLEWORKS_REQUIRE_PREBUILT_PIXI="${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-1}"
-export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-1}"
 
 shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints"
 for checkpoint_var_and_file in \
@@ -207,7 +270,7 @@ for arg in "$@"; do
     esac
 done
 
-source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
+source_proteins_csv="$PROTEINS_CSV"
 if [[ "$needs_runtime_paths" -eq 1 && -f "$source_proteins_csv" ]]; then
     # The shared proteins.csv currently contains absolute /data/inputs paths,
     # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
@@ -225,12 +288,48 @@ fi
 runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}"
 pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}"
 if [[ -z "$pixi_project_dir" ]]; then
-    if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then
+    if ! pixi_inputs_match_image /app "$repo_root" && truthy_env SAMPLEWORKS_ALLOW_RUNTIME_PIXI; then
+        pixi_project_dir="$repo_root"
+    elif [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then
         pixi_project_dir="/app"
     else
         pixi_project_dir="$repo_root"
     fi
 fi
+export SAMPLEWORKS_PIXI_PROJECT_DIR="$pixi_project_dir"
+
+if ! pixi_inputs_match_image /app "$repo_root"; then
+    if truthy_env SAMPLEWORKS_ALLOW_RUNTIME_PIXI; then
+        cat >&2 <<EOF
+Synced pyproject.toml or pixi.lock differs from the baked image. Runtime pixi
+updates are enabled, so using the synced checkout as the pixi project:
+  $repo_root
+EOF
+        export SAMPLEWORKS_REQUIRE_PREBUILT_PIXI="${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-0}"
+        export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-0}"
+    else
+        cat >&2 <<EOF
+Synced pyproject.toml or pixi.lock differs from the baked pixi-with-checkpoints image.
+
+Rebuild/use an image produced from this checkout, or intentionally update pixi
+inside this pod by running with:
+
+  SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 run_experiments ...
+
+Runtime pixi updates can be slow and may rebuild CUDA packages, so they are
+disabled by default for reproducible scientist runs.
+EOF
+        exit 2
+    fi
+else
+    # The ACTL image is expected to provide ready-to-use pixi envs under /app/.pixi.
+    # Do not let sampleworks.runs.runner call `pixi run` just to "prepare" envs;
+    # that can reinstall the CUDA stack inside the pod or spend a long time
+    # refreshing caches. Use SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 for dependency
+    # debugging against the synced checkout.
+    export SAMPLEWORKS_REQUIRE_PREBUILT_PIXI="${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-1}"
+    export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-1}"
+fi
 runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}"
 
 extra_cli_args=()
diff --git a/run_grid_search.py b/run_grid_search.py
index bc4c3173..3301a4c7 100755
--- a/run_grid_search.py
+++ b/run_grid_search.py
@@ -68,6 +68,65 @@ def get_job_status(job: JobConfig) -> str:
         return "failed"
 
 
+def _gpu_indices_from_torch() -> list[str] | None:
+    """Ask PyTorch, if it can be imported, which CUDA ordinals this process sees.
+
+    Returns
+    -------
+    list of str or None
+        ``"0"`` .. ``"N-1"`` for ``N = torch.cuda.device_count()``, or an empty list
+        when CUDA is unavailable. ``None`` if PyTorch is missing or the query raised.
+    """
+    try:
+        import torch
+    except ImportError:
+        return None
+
+    try:
+        cuda_ready = torch.cuda.is_available()
+        device_total = torch.cuda.device_count() if cuda_ready else 0
+    except Exception as exc:
+        log.debug(f"PyTorch CUDA discovery failed: {exc}")
+        return None
+    return [str(ordinal) for ordinal in range(device_total)]
+
+
+def _gpu_indices_from_nvidia_smi() -> list[str] | None:
+    """Return the GPU indices listed by ``nvidia-smi``, used as a fallback probe.
+
+    Returns
+    -------
+    list of str or None
+        GPU indices reported by ``nvidia-smi``. ``None`` means the command is
+        absent, could not be executed, or exited with a non-zero status.
+    """
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+        )
+    except OSError:  # missing binary (FileNotFoundError) or one that cannot be executed
+        return None
+    if result.returncode != 0:
+        return None
+    return [g.strip() for g in result.stdout.strip().split("\n") if g.strip()]
+
+
+def _discover_gpu_indices() -> list[str] | None:
+    """Probe for GPU ordinals, preferring PyTorch over the ``nvidia-smi`` fallback.
+
+    Returns
+    -------
+    list of str or None
+        The PyTorch result when it is not ``None``; otherwise the ``nvidia-smi`` result.
+    """
+    found = _gpu_indices_from_torch()
+    if found is None:
+        found = _gpu_indices_from_nvidia_smi()
+    return found
+
+
 def detect_gpus() -> list[str]:
     """Return CUDA GPU identifiers visible to this grid-search process.
 
@@ -80,37 +139,23 @@ def detect_gpus() -> list[str]:
     cuda_visible_key = cuda_visible.lower()
     if cuda_visible_key in {"none", "void", "nodevfiles"}:
         return []
+    if cuda_visible_key == "all":
+        return _discover_gpu_indices() or ["0"]
     if cuda_visible and cuda_visible_key != "all":
         gpus = [g.strip() for g in cuda_visible.split(",") if g.strip()]
-        try:
-            result = subprocess.run(
-                ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
-                capture_output=True,
-                text=True,
-            )
-            if result.returncode == 0:
-                visible = [g.strip() for g in result.stdout.strip().split("\n") if g.strip()]
-                if all(g.isdigit() for g in gpus + visible):
-                    missing = sorted(set(gpus).difference(visible), key=int)
-                    if missing:
-                        raise ValueError(
-                            "CUDA_VISIBLE_DEVICES references GPUs that are not visible "
-                            f"in this container: {missing}. Visible GPUs: {visible}. "
-                            "Check the preset jobs.*.gpus values for this pod size."
-                        )
-        except FileNotFoundError:
-            pass
+        visible = _gpu_indices_from_nvidia_smi()  # torch renumbers visible GPUs from 0
+        if visible and all(g.isdigit() for g in gpus + visible):
+            missing = sorted(set(gpus).difference(visible), key=int)
+            if missing:
+                raise ValueError(
+                    "CUDA_VISIBLE_DEVICES references GPUs that are not visible "
+                    f"in this container: {missing}. Visible GPUs: {visible}. "
+                    "Check the preset jobs.*.gpus values for this pod size."
+                )
         return gpus
-    try:
-        result = subprocess.run(
-            ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
-            capture_output=True,
-            text=True,
-        )
-        if result.returncode == 0:
-            return [g.strip() for g in result.stdout.strip().split("\n") if g.strip()]
-    except FileNotFoundError:
-        pass
+    discovered = _discover_gpu_indices()
+    if discovered is not None:
+        return discovered
     return ["0"]
 
 
@@ -223,7 +268,11 @@ def run_grid_search(
         for worker_num, job_queue_path in enumerate(job_queue_paths):
             model = worker_job_queues[worker_num][0].model
             future = executor.submit(
-                run_guidance_queue_script, (job_queue_path, model, worker_num, gpus)
+                run_guidance_queue_script,
+                job_queue_path,
+                model,
+                worker_num,
+                gpus,
             )
             futures[future] = job_queue_path
 
@@ -252,34 +301,49 @@ def run_grid_search(
     return results
 
 
-def run_guidance_queue_script(args: tuple[str, str, int, list[str]]):
+def run_guidance_queue_script(
+    job_queue_path: str,
+    model: str,
+    worker_num: int,
+    gpus: list[str],
+) -> subprocess.CompletedProcess[Any]:
     """Run one pickled guidance job queue in the model's pixi environment.
 
     Parameters
     ----------
-    args : tuple of str, str, int, and list of str
-        Job queue path, model name, worker index, and selected GPU entries. CUDA remaps
-        selected entries such as ``4,5`` to local process indices ``0,1``.
+    job_queue_path : str
+        Pickled queue of guidance jobs assigned to this worker.
+    model : str
+        Structure predictor name used to select the pixi environment.
+    worker_num : int
+        Zero-based worker index; ``worker_num % len(gpus)`` is the local CUDA ordinal.
+    gpus : list of str
+        Selected GPU entries. CUDA remaps entries such as ``4,5`` to local
+        process indices ``0,1``.
+
+    Returns
+    -------
+    subprocess.CompletedProcess
+        Result from the subprocess that ran the worker queue.
     """
-    job_queue_path, model, worker_num, gpus = args
-    pixi_env = get_pixi_env(model)
+    pixi_env_name = get_pixi_env(model)
     script_path = Path(__file__).parent / "scripts" / "run_guidance_pipeline.py"
-    env_python = get_pixi_env_python(pixi_env)
+    env_python = get_pixi_env_python(pixi_env_name)
     if env_python:
         cmd = [env_python, str(script_path), "--job-queue-path", job_queue_path]
-        env = get_pixi_env_process_env(env_python)
+        process_env = get_pixi_env_process_env(env_python)
     else:
         cmd = [
             "pixi",
             "run",
             "-e",
-            pixi_env,
+            pixi_env_name,
             "python",
             str(script_path),
             "--job-queue-path",
             job_queue_path,
         ]
-        env = os.environ.copy()
+        process_env = os.environ.copy()
     local_gpu = worker_num % len(gpus)
     requested_gpu = gpus[local_gpu]
     if os.environ.get("CUDA_VISIBLE_DEVICES"):
@@ -292,7 +356,12 @@ def run_guidance_queue_script(args: tuple[str, str, int, list[str]]):
     )
 
     with open(job_queue_path.replace(".pkl", ".log"), "w") as log_file:
-        result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)
+        result = subprocess.run(
+            cmd,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
+            env=process_env,
+        )
     return result
 
 
diff --git a/src/sampleworks/models/boltz/wrapper.py b/src/sampleworks/models/boltz/wrapper.py
index 4efbaecc..d75b6ab5 100644
--- a/src/sampleworks/models/boltz/wrapper.py
+++ b/src/sampleworks/models/boltz/wrapper.py
@@ -362,7 +362,11 @@ def process_structure_for_boltz(
 
     # Keep Boltz dataloading in-process by default. Kubernetes pods usually get
     # a small /dev/shm, and torch DataLoader workers can exhaust it while
-    # sharing large featurized batches back to the parent process.
+    # sharing large featurized batches back to the parent process. This is
+    # Boltz-specific because the Protenix/RF3 wrappers do not expose an
+    # equivalent preprocessing worker pool here; callers can still pass
+    # ``num_workers`` explicitly when profiling shows that multiprocessing is
+    # worth the shared-memory tradeoff.
     config = BoltzConfig(
         out_dir=out_dir or structure.get("metadata", {}).get("id", "boltz_output"),
         num_workers=num_workers,

From e1f834144a620343df90471fbcb4ac6223ad42e6 Mon Sep 17 00:00:00 2001
From: xraymemory <me.anzuoni@gmail.com>
Date: Mon, 25 May 2026 21:45:39 -0400
Subject: [PATCH 28/28] docs(runs): document runtime pixi opt-in

---
 README.md                      |  4 ++--
 run_experiments                | 14 ++++++++++----
 src/sampleworks/runs/runner.py |  7 ++++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 082443f4..f9f31aa8 100644
--- a/README.md
+++ b/README.md
@@ -242,8 +242,8 @@ The ACTL image contains baked pixi environments under `/app/.pixi`. If your
 synced branch changes `pyproject.toml` or `pixi.lock`, `run_experiments` stops
 with a clear error instead of mutating the baked environment. For dependency
 debugging only, opt into an on-pod pixi update with
-`SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 run_experiments ...`; reproducible scientist
-runs should use a rebuilt `pixi-with-checkpoints:sampleworks` image instead.
+`RUNTIME_PIXI=1 run_experiments ...`; reproducible scientist runs should use a
+rebuilt `pixi-with-checkpoints:sampleworks` image instead.
 
 
 ## Docker
diff --git a/run_experiments b/run_experiments
index eca31bce..08d2700d 100755
--- a/run_experiments
+++ b/run_experiments
@@ -286,6 +286,12 @@ if [[ "$needs_runtime_paths" -eq 1 && -f "$source_proteins_csv" ]]; then
 fi
 
 runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}"
+# RUNTIME_PIXI=1 is the short scientist-facing escape hatch for branches whose
+# pyproject.toml or pixi.lock no longer match the baked image. Normalize it to
+# the internal flag that both this wrapper and sampleworks.runs.runner consume.
+if truthy_env RUNTIME_PIXI; then
+    export SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1
+fi
 pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}"
 if [[ -z "$pixi_project_dir" ]]; then
     if ! pixi_inputs_match_image /app "$repo_root" && truthy_env SAMPLEWORKS_ALLOW_RUNTIME_PIXI; then
@@ -314,7 +320,7 @@ Synced pyproject.toml or pixi.lock differs from the baked pixi-with-checkpoints
 Rebuild/use an image produced from this checkout, or intentionally update pixi
 inside this pod by running with:
 
-  SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 run_experiments ...
+  RUNTIME_PIXI=1 run_experiments ...
 
 Runtime pixi updates can be slow and may rebuild CUDA packages, so they are
 disabled by default for reproducible scientist runs.
@@ -325,7 +331,7 @@ else
     # The ACTL image is expected to provide ready-to-use pixi envs under /app/.pixi.
     # Do not let sampleworks.runs.runner call `pixi run` just to "prepare" envs;
     # that can reinstall the CUDA stack inside the pod or spend a long time
-    # refreshing caches. Use SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 for dependency
+    # refreshing caches. Use RUNTIME_PIXI=1 for dependency
     # debugging against the synced checkout.
     export SAMPLEWORKS_REQUIRE_PREBUILT_PIXI="${SAMPLEWORKS_REQUIRE_PREBUILT_PIXI:-1}"
     export SAMPLEWORKS_SKIP_ENV_PREPARE="${SAMPLEWORKS_SKIP_ENV_PREPARE:-1}"
@@ -391,7 +397,7 @@ if [[ -x "$runner_python" ]]; then
         "$@"
 fi
 
-if [[ ! "${SAMPLEWORKS_ALLOW_RUNTIME_PIXI:-}" =~ ^(1|true|yes)$ ]]; then
+if ! truthy_env SAMPLEWORKS_ALLOW_RUNTIME_PIXI; then
     cat >&2 <<EOF
 Prebuilt runner pixi environment is missing: $runner_python
 
@@ -400,7 +406,7 @@ ready-to-use environments under /app/.pixi. Refusing to run 'pixi run' because
 that would install or refresh packages inside the pod.
 
 Recreate the pod with the current pixi-with-checkpoints image. If you are
-intentionally debugging runtime pixi setup, set SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1.
+intentionally debugging runtime pixi setup, set RUNTIME_PIXI=1.
 EOF
     exit 2
 fi
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
index 1442adf9..4f6a44d9 100644
--- a/src/sampleworks/runs/runner.py
+++ b/src/sampleworks/runs/runner.py
@@ -367,9 +367,10 @@ def _require_prebuilt_envs() -> bool:
         True when the ACTL wrapper/image requires baked pixi environments and
         the caller has not explicitly opted into runtime pixi installation.
     """
-    return _truthy_env("SAMPLEWORKS_REQUIRE_PREBUILT_PIXI") and not _truthy_env(
+    allow_runtime_pixi = _truthy_env("RUNTIME_PIXI") or _truthy_env(
         "SAMPLEWORKS_ALLOW_RUNTIME_PIXI"
     )
+    return _truthy_env("SAMPLEWORKS_REQUIRE_PREBUILT_PIXI") and not allow_runtime_pixi
 
 
 def _missing_prebuilt_env_message(pixi_env: str) -> str:
@@ -391,8 +392,8 @@ def _missing_prebuilt_env_message(pixi_env: str) -> str:
         "The pixi-with-checkpoints image must contain ready-to-use boltz, "
         "protenix, and rf3 environments. Refusing to fall back to 'pixi run' "
         "because that would install or refresh packages inside the pod. "
-        "Recreate the pod with the current image, or set "
-        "SAMPLEWORKS_ALLOW_RUNTIME_PIXI=1 only when intentionally debugging pixi."
+        "Recreate the pod with the current image, or set RUNTIME_PIXI=1 only "
+        "when intentionally debugging pixi."
     )