From 38542dafc275e52d7b1dff70be5515d6e636a9bc Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 18:38:08 -0700 Subject: [PATCH 01/28] feat(runs): add sampleworks-runs preset orchestrator Adds a new `sampleworks-runs` CLI under `src/sampleworks/runs/` that launches parallel `run_grid_search.py` jobs from a single TOML preset. Five bundled presets (`all_models`, `rf3_partial`, `rf3_partial_chiral_off`, `protenix_dual`, `rf3_protenix`) cover the canonical multi-model sweeps. Each preset declares its jobs (pixi env, GPU assignment, args); the runner sets `CUDA_VISIBLE_DEVICES`, shells out via `pixi run -e <env>`, tees per-job logs, and aggregates exit codes. Dotted-path `--set` overrides let users sweep parameters without editing TOML, e.g.: sampleworks-runs rf3_partial --set jobs.rf3.args.gradient-weights="0.0 0.01" Pure stdlib (tomllib + dataclasses + argparse + subprocess), no new deps. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 1 + README.md | 26 +++ pyproject.toml | 4 + src/sampleworks/runs/__init__.py | 5 + src/sampleworks/runs/cli.py | 122 ++++++++++++ src/sampleworks/runs/loader.py | 178 ++++++++++++++++++ src/sampleworks/runs/presets/all_models.toml | 44 +++++ .../runs/presets/protenix_dual.toml | 34 ++++ src/sampleworks/runs/presets/rf3_partial.toml | 24 +++ .../runs/presets/rf3_partial_chiral_off.toml | 26 +++ .../runs/presets/rf3_protenix.toml | 29 +++ src/sampleworks/runs/runner.py | 146 ++++++++++++++ src/sampleworks/runs/schema.py | 60 ++++++ tests/runs/__init__.py | 0 tests/runs/test_cli.py | 88 +++++++++ tests/runs/test_loader.py | 142 ++++++++++++++ tests/runs/test_runner.py | 111 +++++++++++ 17 files changed, 1040 insertions(+) create mode 100644 src/sampleworks/runs/__init__.py create mode 100644 src/sampleworks/runs/cli.py create mode 100644 src/sampleworks/runs/loader.py create mode 100644 src/sampleworks/runs/presets/all_models.toml create mode 100644 src/sampleworks/runs/presets/protenix_dual.toml create 
mode 100644 src/sampleworks/runs/presets/rf3_partial.toml create mode 100644 src/sampleworks/runs/presets/rf3_partial_chiral_off.toml create mode 100644 src/sampleworks/runs/presets/rf3_protenix.toml create mode 100644 src/sampleworks/runs/runner.py create mode 100644 src/sampleworks/runs/schema.py create mode 100644 tests/runs/__init__.py create mode 100644 tests/runs/test_cli.py create mode 100644 tests/runs/test_loader.py create mode 100644 tests/runs/test_runner.py diff --git a/AGENTS.md b/AGENTS.md index b56d51d6..7e1b9436 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -154,6 +154,7 @@ src/sampleworks/ ├── metrics/ # Quality metrics (LDDT, sidechain) ├── eval/ # Evaluation utilities ├── data/ # Reference data (protein configs) +├── runs/ # `sampleworks-runs` CLI + TOML preset orchestrator └── utils/ # Shared utilities ``` diff --git a/README.md b/README.md index 0b123355..efdcc537 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,32 @@ Output layout: `grid_search_results//[_]//ens Instructions for running evaluation and metrics scripts are coming soon. +## Preset experiments (`sampleworks-runs`) + +For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes. 
+ +```bash +pixi run -e rf3 sampleworks-runs --list # bundled presets +pixi run -e rf3 sampleworks-runs rf3_partial # run a preset +pixi run -e rf3 sampleworks-runs rf3_partial --show # inspect resolved values +pixi run -e rf3 sampleworks-runs rf3_partial --dry-run # print pixi run commands, don't execute +pixi run -e rf3 sampleworks-runs all_models --only rf3,protenix # subset jobs + +# Override any value without editing the TOML: +pixi run -e rf3 sampleworks-runs rf3_partial \ + --set jobs.rf3.gpus=7 \ + --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02" +``` + +Bundled presets live in `src/sampleworks/runs/presets/*.toml`. Add a new preset by dropping a `.toml` file alongside them or pointing at any path: + +```bash +sampleworks-runs ./my_experiment.toml +``` + +Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block. + + ## Docker TODO: Docker container documentation diff --git a/pyproject.toml b/pyproject.toml index 613a784b..5b97b9fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,10 @@ version = "0.6.2" [project.scripts] sampleworks-guidance = "sampleworks.cli.guidance:main" +sampleworks-runs = "sampleworks.runs.cli:main" + +[tool.hatch.build.targets.wheel.force-include] +"src/sampleworks/runs/presets" = "sampleworks/runs/presets" [tool.hatch.metadata] allow-direct-references = true diff --git a/src/sampleworks/runs/__init__.py b/src/sampleworks/runs/__init__.py new file mode 100644 index 00000000..8bb071ac --- /dev/null +++ b/src/sampleworks/runs/__init__.py @@ -0,0 +1,5 @@ +"""Preset-driven orchestrator for parallel run_grid_search.py invocations. + +Replaces the previous ACTL-native bash wrapper scripts with TOML presets + +a thin Python runner. See ``sampleworks-runs --help``. 
+""" diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py new file mode 100644 index 00000000..5a76f4e2 --- /dev/null +++ b/src/sampleworks/runs/cli.py @@ -0,0 +1,122 @@ +"""Command-line entry point for ``sampleworks-runs``.""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from . import loader, runner +from .schema import Preset + + +def main(argv: list[str] | None = None) -> int: + parser = _build_parser() + args = parser.parse_args(argv) + + if args.list: + for name in loader.list_bundled_presets(): + print(name) + return 0 + + if args.preset is None: + parser.error("PRESET is required (or pass --list)") + + preset = loader.load_preset(args.preset, overrides=args.set) + if args.only: + preset = _filter_only(preset, args.only) + + if args.show: + _print_show(preset) + return 0 + + results_dir = Path(args.results_dir or _default_results_dir(preset)) + return runner.run(preset, results_dir=results_dir, dry_run=args.dry_run) + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="sampleworks-runs", + description=( + "Run a preset of parallel run_grid_search.py jobs. " + "Presets are TOML files bundled under sampleworks.runs.presets, " + "or pass a path to a .toml file directly." + ), + ) + parser.add_argument("preset", nargs="?", help="Bundled preset name or path to a .toml file") + parser.add_argument("--list", action="store_true", help="List bundled presets and exit") + parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit") + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the pixi run commands instead of executing them", + ) + parser.add_argument( + "--only", + default="", + help="Comma-separated job names to run (subset). 
Default: all jobs.", + ) + parser.add_argument( + "--set", + action="append", + default=[], + metavar="DOTTED_KEY=VALUE", + help=( + "Override a value in the loaded preset. Examples: " + "--set defaults.DATA_DIR=/data/foo, " + "--set jobs.rf3.args.gradient-weights='0.0 0.01', " + "--set jobs.0.gpus=5" + ), + ) + parser.add_argument( + "--results-dir", + default=None, + help="Override RESULTS_DIR for this run (also controls per-job log location).", + ) + return parser + + +def _filter_only(preset: Preset, only: str) -> Preset: + names = [n.strip() for n in only.split(",") if n.strip()] + keep = [j for j in preset.jobs if j.name in names] + missing = set(names) - {j.name for j in keep} + if missing: + raise SystemExit(f"--only references unknown jobs: {sorted(missing)}") + return Preset( + name=preset.name, + description=preset.description, + defaults=preset.defaults, + jobs=keep, + ) + + +def _print_show(preset: Preset) -> None: + print(f"name: {preset.name}") + if preset.description: + print(f"description: {preset.description}") + if preset.defaults: + print("defaults:") + for k, v in preset.defaults.items(): + print(f" {k} = {v}") + print("jobs:") + for j in preset.jobs: + print(f" - name: {j.name}") + print(f" env: {j.env}") + print(f" gpus: {j.gpus}") + print(f" output_subdir: {j.output_subdir}") + print(" args:") + for k, v in j.args.items(): + print(f" {k} = {v!r}") + + +def _default_results_dir(preset: Preset) -> str: + return ( + preset.defaults.get("RESULTS_DIR") + or os.environ.get("RESULTS_DIR") + or "./grid_search_results" + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py new file mode 100644 index 00000000..328b4155 --- /dev/null +++ b/src/sampleworks/runs/loader.py @@ -0,0 +1,178 @@ +"""Load presets from TOML and apply runtime overrides. + +Resolution order for every string value (defaults block and ``args``): + 1. 
``${VAR}`` references are resolved against the process environment, + with the preset's ``[defaults]`` block filling in any unset keys. + 2. ``--set =`` CLI overrides are applied last. +""" + +from __future__ import annotations + +import os +import re +import tomllib +from collections.abc import Iterable +from importlib import resources +from pathlib import Path +from typing import Any + +from .schema import Job, Preset + +_BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets" +_VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") + + +def list_bundled_presets() -> list[str]: + """Return the names (sans ``.toml``) of bundled presets, sorted.""" + files = resources.files(_BUNDLED_PRESETS_PACKAGE) + return sorted(p.name.removesuffix(".toml") for p in files.iterdir() if p.name.endswith(".toml")) + + +def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset: + """Load a preset by bundled name or filesystem path, applying ``--set`` overrides.""" + raw = _read_toml(name_or_path) + overrides_list = list(overrides) + raw = _apply_overrides(raw, overrides_list) + raw = _resolve_variables(raw) + return _build_preset(name=_preset_name(name_or_path), raw=raw) + + +def _read_toml(name_or_path: str) -> dict[str, Any]: + path = Path(name_or_path) + if path.suffix == ".toml" and path.exists(): + return tomllib.loads(path.read_text()) + bundled = resources.files(_BUNDLED_PRESETS_PACKAGE) / f"{name_or_path}.toml" + if not bundled.is_file(): + raise FileNotFoundError( + f"No preset {name_or_path!r}. Bundled: {list_bundled_presets()}. " + f"Or pass a path to a .toml file." 
+ ) + return tomllib.loads(bundled.read_text()) + + +def _preset_name(name_or_path: str) -> str: + return Path(name_or_path).stem if name_or_path.endswith(".toml") else name_or_path + + +def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any]: + for spec in overrides: + if "=" not in spec: + raise ValueError(f"--set expects KEY=VALUE, got {spec!r}") + key, value = spec.split("=", 1) + _set_dotted(raw, key.strip(), _coerce(value)) + return raw + + +def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None: + """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``.""" + parts = dotted.split(".") + cursor: Any = obj + for i, part in enumerate(parts[:-1]): + cursor = _index(cursor, part, where=".".join(parts[: i + 1])) + leaf_parent = cursor + leaf_key = parts[-1] + if isinstance(leaf_parent, list): + leaf_parent[_find_in_list(leaf_parent, leaf_key, where=dotted)] = value + else: + leaf_parent[leaf_key] = value + + +def _index(cursor: Any, part: str, *, where: str) -> Any: + if isinstance(cursor, list): + return cursor[_find_in_list(cursor, part, where=where)] + if isinstance(cursor, dict): + if part not in cursor: + cursor[part] = {} + return cursor[part] + raise TypeError(f"Cannot descend into {type(cursor).__name__} at {where!r}") + + +def _find_in_list(items: list[Any], key: str, *, where: str) -> int: + if key.isdigit() or (key.startswith("-") and key[1:].isdigit()): + return int(key) + for i, item in enumerate(items): + if isinstance(item, dict) and item.get("name") == key: + return i + raise KeyError(f"No list element named {key!r} at {where!r}") + + +def _coerce(value: str) -> Any: + if value.lower() in ("true", "false"): + return value.lower() == "true" + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + return value + + +def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]: + """Expand ``${VAR}`` in every string. 
Env wins; defaults block fills gaps. + + Defaults are resolved in TOML order, so later defaults can reference earlier ones + (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``). + """ + defaults: dict[str, str] = dict(raw.get("defaults", {})) + accumulated: dict[str, str] = dict(os.environ) + resolved_defaults: dict[str, str] = {} + for key, default_value in defaults.items(): + if key in os.environ: + resolved_defaults[key] = os.environ[key] + else: + resolved_defaults[key] = _expand(default_value, accumulated) + accumulated[key] = resolved_defaults[key] + resolved = _walk(raw, accumulated) + resolved["defaults"] = resolved_defaults + return resolved + + +def _walk(obj: Any, env: dict[str, str]) -> Any: + if isinstance(obj, dict): + return {k: _walk(v, env) for k, v in obj.items()} + if isinstance(obj, list): + return [_walk(item, env) for item in obj] + if isinstance(obj, str): + return _expand(obj, env) + return obj + + +def _expand(text: str, env: dict[str, str]) -> str: + def repl(match: re.Match[str]) -> str: + var = match.group(1) + if var not in env: + raise KeyError(f"Undefined variable ${{{var}}} in preset (no env var, no default)") + return env[var] + + prev = None + current = text + while prev != current: + prev = current + current = _VAR_PATTERN.sub(repl, current) + return current + + +def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset: + raw_jobs = raw.get("jobs", []) + if not isinstance(raw_jobs, list): + raise ValueError(f"Preset {name!r}: 'jobs' must be a list") + jobs = [ + Job( + name=str(j["name"]), + env=str(j["env"]), + gpus=str(j["gpus"]), + output_subdir=str(j["output_subdir"]), + args=dict(j.get("args", {})), + ) + for j in raw_jobs + ] + return Preset( + name=name, + description=str(raw.get("description", "")), + defaults=dict(raw.get("defaults", {})), + shared_args=dict(raw.get("shared_args", {})), + jobs=jobs, + ) diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml new file 
mode 100644 index 00000000..21d1cd92 --- /dev/null +++ b/src/sampleworks/runs/presets/all_models.toml @@ -0,0 +1,44 @@ +description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)." + +[defaults] +DATA_DIR = "/data/input" +RESULTS_DIR = "/data/results" +MSA_CACHE_DIR = "${HOME}/.sampleworks/msa" +PROTEINS_CSV = "${DATA_DIR}/proteins.csv" + +[shared_args] +proteins = "${PROTEINS_CSV}" +scalers = "pure_guidance" +partial-diffusion-step = 120 +ensemble-sizes = "8" +gradient-normalization = true +augmentation = true +align-to-input = true + +[[jobs]] +name = "boltz2_xrd" +env = "boltz" +gpus = "0,1" +output_subdir = "boltz2_xrd" +args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" } + +[[jobs]] +name = "boltz2_md" +env = "boltz" +gpus = "2,3" +output_subdir = "boltz2_md" +args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" } + +[[jobs]] +name = "rf3" +env = "rf3" +gpus = "4,5" +output_subdir = "rf3" +args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1" } + +[[jobs]] +name = "protenix" +env = "protenix" +gpus = "6,7" +output_subdir = "protenix" +args = { model = "protenix", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" } diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml new file mode 100644 index 00000000..461d547e --- /dev/null +++ b/src/sampleworks/runs/presets/protenix_dual.toml @@ -0,0 +1,34 @@ +description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)." 
+ +[defaults] +DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results" +MSA_CACHE_DIR = "/data/sampleworks-exp/msa_cache" +PROTEINS_CSV = "${DATA_DIR}/proteins.csv" +PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt" +PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt" + +[shared_args] +proteins = "${PROTEINS_CSV}" +model = "protenix" +scalers = "pure_guidance" +partial-diffusion-step = 120 +ensemble-sizes = "8" +gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" +gradient-normalization = true +augmentation = true +align-to-input = true + +[[jobs]] +name = "protenix_tiny" +env = "protenix" +gpus = "2,3" +output_subdir = "protenix_tiny" +args = { model-checkpoint = "${PROTENIX_TINY_CHECKPOINT}" } + +[[jobs]] +name = "protenix_mini" +env = "protenix" +gpus = "6,7" +output_subdir = "protenix_mini" +args = { model-checkpoint = "${PROTENIX_MINI_CHECKPOINT}" } diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml new file mode 100644 index 00000000..533accfb --- /dev/null +++ b/src/sampleworks/runs/presets/rf3_partial.toml @@ -0,0 +1,24 @@ +description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)." 
+ +[defaults] +DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results" +MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +PROTEINS_CSV = "${DATA_DIR}/proteins.csv" +RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" + +[shared_args] +proteins = "${PROTEINS_CSV}" +scalers = "pure_guidance" +partial-diffusion-step = 120 +ensemble-sizes = "8" +gradient-normalization = true +augmentation = true +align-to-input = true + +[[jobs]] +name = "rf3" +env = "rf3" +gpus = "4" +output_subdir = "rf3" +args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1", model-checkpoint = "${RF3_CHECKPOINT}" } diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml new file mode 100644 index 00000000..c1f34820 --- /dev/null +++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml @@ -0,0 +1,26 @@ +description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep." + +[defaults] +DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results_rf3_chiral_off" +MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +PROTEINS_CSV = "${DATA_DIR}/proteins.csv" +RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" + +[shared_args] +proteins = "${PROTEINS_CSV}" +scalers = "pure_guidance" +partial-diffusion-step = 120 +ensemble-sizes = "8" +gradient-normalization = true +augmentation = true +align-to-input = true +force-all = true +disable-chiral-features = true + +[[jobs]] +name = "rf3" +env = "rf3" +gpus = "5" +output_subdir = "." 
+args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" } diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml new file mode 100644 index 00000000..9255cfea --- /dev/null +++ b/src/sampleworks/runs/presets/rf3_protenix.toml @@ -0,0 +1,29 @@ +description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)." + +[defaults] +DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results" +MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +PROTEINS_CSV = "${DATA_DIR}/proteins.csv" + +[shared_args] +proteins = "${PROTEINS_CSV}" +scalers = "pure_guidance" +ensemble-sizes = "8" +gradient-normalization = true +augmentation = true +align-to-input = true + +[[jobs]] +name = "rf3" +env = "rf3" +gpus = "0,1,2,3" +output_subdir = "rf3" +args = { model = "rf3", gradient-weights = "0.0 0.01 0.02 0.05 0.1" } + +[[jobs]] +name = "protenix" +env = "protenix" +gpus = "4,5,6,7" +output_subdir = "protenix" +args = { model = "protenix", partial-diffusion-step = 120, gradient-weights = "0.0 0.1 0.2 0.5" } diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py new file mode 100644 index 00000000..2623ba22 --- /dev/null +++ b/src/sampleworks/runs/runner.py @@ -0,0 +1,146 @@ +"""Build job argv and orchestrate parallel subprocess execution.""" + +from __future__ import annotations + +import os +import shlex +import subprocess +import sys +import threading +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from .schema import Job, Preset + +GRID_SEARCH_SCRIPT = "/app/run_grid_search.py" + + +@dataclass +class JobInvocation: + job: Job + argv: list[str] + env: dict[str, str] + log_path: Path + + +def build_invocations(preset: Preset, *, results_dir: Path) -> 
list[JobInvocation]: + """Build the subprocess argv + env + log path for every job in the preset.""" + invocations: list[JobInvocation] = [] + for job in preset.jobs: + args = preset.effective_args(job) + args.setdefault("output-dir", str(results_dir / job.output_subdir)) + argv = _build_argv(job.env, args) + env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus} + log_path = results_dir / f"{job.name}_run.log" + invocations.append(JobInvocation(job=job, argv=argv, env=env, log_path=log_path)) + return invocations + + +def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]: + argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT] + for key, value in args.items(): + flag = f"--{key}" + if isinstance(value, bool): + if value: + argv.append(flag) + elif value is None: + continue + else: + argv.extend([flag, str(value)]) + return argv + + +def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: + """Launch every job in parallel; tee output to per-job logs; return 0 iff all succeed.""" + results_dir.mkdir(parents=True, exist_ok=True) + invocations = build_invocations(preset, results_dir=results_dir) + + if dry_run: + for inv in invocations: + _print_dry_run(inv) + return 0 + + _print_launch_summary(preset, invocations) + processes = [_spawn(inv) for inv in invocations] + return _wait_all(processes) + + +def _print_dry_run(inv: JobInvocation) -> None: + print(f"# job: {inv.job.name} (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr) + print(f"# log: {inv.log_path}", file=sys.stderr) + print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}") + print(file=sys.stderr) + + +def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> None: + bar = "=" * 60 + print(bar, file=sys.stderr) + print(f"preset: {preset.name}", file=sys.stderr) + if preset.description: + print(f" {preset.description}", file=sys.stderr) + for inv in invocations: + print( + f" - {inv.job.name}: env={inv.job.env}, 
gpus={inv.job.gpus}, log={inv.log_path}", + file=sys.stderr, + ) + print(bar, file=sys.stderr) + + +@dataclass +class _RunningJob: + inv: JobInvocation + proc: subprocess.Popen[bytes] + tee_thread: threading.Thread + + +def _spawn(inv: JobInvocation) -> _RunningJob: + inv.log_path.parent.mkdir(parents=True, exist_ok=True) + log_file = open(inv.log_path, "wb") + proc = subprocess.Popen( + inv.argv, + env=inv.env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=0, + ) + assert proc.stdout is not None + thread = threading.Thread( + target=_tee, + args=(inv.job.name, proc.stdout, log_file), + daemon=True, + ) + thread.start() + print(f"[{_ts()}] launched {inv.job.name} (pid {proc.pid})", file=sys.stderr) + return _RunningJob(inv=inv, proc=proc, tee_thread=thread) + + +def _wait_all(jobs: list[_RunningJob]) -> int: + failures = 0 + for j in jobs: + exit_code = j.proc.wait() + j.tee_thread.join() + if exit_code == 0: + print(f"[{_ts()}] {j.inv.job.name} succeeded", file=sys.stderr) + else: + print(f"[{_ts()}] {j.inv.job.name} FAILED (exit {exit_code})", file=sys.stderr) + failures += 1 + return 0 if failures == 0 else 1 + + +def _tee(prefix: str, src: Any, dest: Any) -> None: + for line in iter(src.readline, b""): + dest.write(line) + dest.flush() + sys.stderr.write(f"[{prefix}] {line.decode('utf-8', errors='replace')}") + sys.stderr.flush() + dest.close() + + +def _ts() -> str: + return time.strftime("%Y-%m-%d %H:%M:%S") + + +def _shell_join(argv: list[str]) -> str: + return shlex.join(argv) diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py new file mode 100644 index 00000000..216a6bc0 --- /dev/null +++ b/src/sampleworks/runs/schema.py @@ -0,0 +1,60 @@ +"""Dataclasses for the preset schema. + +A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job +is launched as ``pixi run -e python /app/run_grid_search.py `` with +``CUDA_VISIBLE_DEVICES`` set to the job's GPU assignment. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +VALID_PIXI_ENVS = ("boltz", "protenix", "rf3") + + +@dataclass +class Job: + name: str + env: str + gpus: str + output_subdir: str + args: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + if self.env not in VALID_PIXI_ENVS: + raise ValueError( + f"Job {self.name!r}: env must be one of {VALID_PIXI_ENVS}, got {self.env!r}" + ) + if not self.gpus: + raise ValueError(f"Job {self.name!r}: gpus must be non-empty") + if not self.output_subdir: + raise ValueError(f"Job {self.name!r}: output_subdir must be non-empty") + + +@dataclass +class Preset: + name: str + description: str + defaults: dict[str, str] = field(default_factory=dict) + shared_args: dict[str, Any] = field(default_factory=dict) + jobs: list[Job] = field(default_factory=list) + + def __post_init__(self) -> None: + if not self.jobs: + raise ValueError(f"Preset {self.name!r}: must declare at least one job") + seen: set[str] = set() + for job in self.jobs: + if job.name in seen: + raise ValueError(f"Preset {self.name!r}: duplicate job name {job.name!r}") + seen.add(job.name) + + def job(self, name: str) -> Job: + for j in self.jobs: + if j.name == name: + return j + raise KeyError(f"Preset {self.name!r} has no job {name!r}") + + def effective_args(self, job: Job) -> dict[str, Any]: + """Return ``shared_args`` merged with per-job overrides.""" + return {**self.shared_args, **job.args} diff --git a/tests/runs/__init__.py b/tests/runs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py new file mode 100644 index 00000000..c93afe2c --- /dev/null +++ b/tests/runs/test_cli.py @@ -0,0 +1,88 @@ +"""End-to-end CLI tests (--list, --show, --dry-run, --only).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from sampleworks.runs import cli + + +def 
test_list_prints_all_bundled_presets(capsys: pytest.CaptureFixture[str]) -> None: + exit_code = cli.main(["--list"]) + assert exit_code == 0 + out = capsys.readouterr().out.splitlines() + assert set(out) == { + "all_models", + "rf3_partial", + "rf3_partial_chiral_off", + "protenix_dual", + "rf3_protenix", + } + + +def test_show_prints_resolved_preset( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setenv("HOME", "/home/test") + exit_code = cli.main(["rf3_partial", "--show"]) + assert exit_code == 0 + out = capsys.readouterr().out + assert "name: rf3_partial" in out + assert "gradient-weights" in out + + +def test_dry_run_does_not_invoke_subprocess( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setenv("HOME", str(tmp_path)) + exit_code = cli.main( + ["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)] + ) + assert exit_code == 0 + out = capsys.readouterr().out + assert "pixi run -e rf3 python /app/run_grid_search.py" in out + assert "CUDA_VISIBLE_DEVICES=4" in out + + +def test_only_filters_to_subset( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setenv("HOME", "/home/test") + exit_code = cli.main(["all_models", "--only", "rf3,protenix", "--show"]) + assert exit_code == 0 + out = capsys.readouterr().out + assert "name: rf3" in out + assert "name: protenix" in out + assert "boltz2_xrd" not in out + assert "boltz2_md" not in out + + +def test_only_with_unknown_job_errors(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + with pytest.raises(SystemExit, match="unknown jobs"): + cli.main(["all_models", "--only", "nonexistent", "--show"]) + + +def test_set_override_propagates_through_cli( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setenv("HOME", "/home/test") + exit_code = cli.main( + [ + "rf3_partial", + "--set", 
+ "jobs.rf3.args.gradient-weights=0.0 0.01", + "--show", + ] + ) + assert exit_code == 0 + out = capsys.readouterr().out + assert "0.0 0.01" in out + + +def test_no_preset_and_no_list_errors(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + with pytest.raises(SystemExit): + cli.main([]) diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py new file mode 100644 index 00000000..66eeb5b9 --- /dev/null +++ b/tests/runs/test_loader.py @@ -0,0 +1,142 @@ +"""Unit tests for sampleworks.runs.loader.""" + +from __future__ import annotations + +import os +from pathlib import Path + +import pytest + +from sampleworks.runs import loader +from sampleworks.runs.schema import Preset + + +BUNDLED = ["all_models", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"] + + +def test_list_bundled_presets_returns_the_five() -> None: + names = loader.list_bundled_presets() + assert set(names) == set(BUNDLED), f"unexpected bundled presets: {names}" + + +@pytest.mark.parametrize("name", BUNDLED) +def test_each_bundled_preset_loads(name: str, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset(name) + assert preset.name == name + assert preset.jobs, f"{name} has no jobs" + for job in preset.jobs: + assert job.env in ("boltz", "protenix", "rf3") + + +def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + monkeypatch.setenv("DATA_DIR", "/from/env") + preset = loader.load_preset("rf3_partial") + assert preset.defaults["DATA_DIR"] == "/from/env" + rf3 = preset.job("rf3") + # PROTEINS_CSV expands to ${DATA_DIR}/proteins.csv; DATA_DIR overridden by env + proteins = preset.shared_args["proteins"] + assert proteins == "/from/env/proteins.csv" + + +def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DATA_DIR", raising=False) + 
monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("rf3_partial") + assert preset.defaults["DATA_DIR"] == "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" + + +def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("DATA_DIR", raising=False) + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("rf3_partial", overrides=["defaults.DATA_DIR=/custom"]) + assert preset.defaults["DATA_DIR"] == "/custom" + assert preset.shared_args["proteins"] == "/custom/proteins.csv" + + +def test_set_override_at_job_by_name(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("all_models", overrides=["jobs.rf3.gpus=7"]) + assert preset.job("rf3").gpus == "7" + + +def test_set_override_at_job_by_index(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("all_models", overrides=["jobs.0.gpus=9"]) + assert preset.jobs[0].gpus == "9" + + +def test_set_override_at_args_inside_job(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset( + "rf3_partial", overrides=["jobs.rf3.args.gradient-weights=0.0 0.01"] + ) + assert preset.job("rf3").args["gradient-weights"] == "0.0 0.01" + + +def test_set_coerces_bool_and_int(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset( + "rf3_partial", + overrides=[ + "shared_args.gradient-normalization=false", + "jobs.rf3.args.partial-diffusion-step=200", + ], + ) + assert preset.shared_args["gradient-normalization"] is False + # job.args["partial-diffusion-step"] doesn't exist by default in rf3_partial, + # but --set should still create or override it + assert preset.job("rf3").args["partial-diffusion-step"] == 200 + + +def test_load_preset_from_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + 
monkeypatch.setenv("HOME", "/home/test") + custom = tmp_path / "mycustom.toml" + custom.write_text( + 'description = "custom"\n' + "[defaults]\n" + 'DATA_DIR = "/x"\n' + "[shared_args]\n" + 'model = "rf3"\n' + "[[jobs]]\n" + 'name = "j1"\n' + 'env = "rf3"\n' + 'gpus = "0"\n' + 'output_subdir = "j1"\n' + "args = {}\n" + ) + preset = loader.load_preset(str(custom)) + assert preset.name == "mycustom" + assert preset.defaults["DATA_DIR"] == "/x" + + +def test_unknown_preset_raises() -> None: + with pytest.raises(FileNotFoundError): + loader.load_preset("does_not_exist") + + +def test_undefined_variable_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + bad = tmp_path / "bad.toml" + bad.write_text( + '[shared_args]\nproteins = "${NEVER_DEFINED_VAR}/x"\n' + '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n' + ) + monkeypatch.delenv("NEVER_DEFINED_VAR", raising=False) + with pytest.raises(KeyError, match="NEVER_DEFINED_VAR"): + loader.load_preset(str(bad)) + + +def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + with pytest.raises(ValueError, match="KEY=VALUE"): + loader.load_preset("rf3_partial", overrides=["bogus_no_equals"]) + + +def test_bad_env_rejected(tmp_path: Path) -> None: + bad = tmp_path / "bad.toml" + bad.write_text( + '[[jobs]]\nname = "j"\nenv = "not_a_real_env"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n' + ) + with pytest.raises(ValueError, match="env must be one of"): + loader.load_preset(str(bad)) diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py new file mode 100644 index 00000000..4ac31ab9 --- /dev/null +++ b/tests/runs/test_runner.py @@ -0,0 +1,111 @@ +"""Unit tests for sampleworks.runs.runner argv builder.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from sampleworks.runs import loader, runner + + +def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) 
-> None: + """Faithful translation: argv should match the canonical rf3_partial bash invocation.""" + monkeypatch.setenv("HOME", "/home/test") + monkeypatch.delenv("DATA_DIR", raising=False) + monkeypatch.delenv("RESULTS_DIR", raising=False) + preset = loader.load_preset("rf3_partial") + invocations = runner.build_invocations(preset, results_dir=Path("/results")) + + assert len(invocations) == 1 + inv = invocations[0] + assert inv.job.name == "rf3" + assert inv.env["CUDA_VISIBLE_DEVICES"] == "4" + assert inv.log_path == Path("/results/rf3_run.log") + + argv = inv.argv + assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"] + pairs = _argv_to_dict(argv[6:]) + assert pairs["--proteins"] == ( + "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv" + ) + assert pairs["--model"] == "rf3" + assert pairs["--scalers"] == "pure_guidance" + assert pairs["--partial-diffusion-step"] == "120" + assert pairs["--ensemble-sizes"] == "8" + assert pairs["--gradient-weights"] == "0.0 0.005 0.01 0.02 0.035 0.05 0.1" + assert pairs["--model-checkpoint"] == "/checkpoints/rf3_foundry_01_24_latest.ckpt" + assert pairs["--output-dir"] == "/results/rf3" + # store_true flags appear as bare keys (value=True in our dict) + assert pairs["--gradient-normalization"] is True + assert pairs["--augmentation"] is True + assert pairs["--align-to-input"] is True + + +def test_argv_omits_false_bool_flags(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset( + "rf3_partial", overrides=["shared_args.gradient-normalization=false"] + ) + inv = runner.build_invocations(preset, results_dir=Path("/results"))[0] + assert "--gradient-normalization" not in inv.argv + + +def test_explicit_output_dir_in_args_wins_over_subdir_default( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("HOME", "/home/test") + custom = tmp_path / "custom.toml" + custom.write_text( + 
"[shared_args]\n" + '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "sub"\n' + 'args = { "output-dir" = "/explicit/path" }\n' + ) + preset = loader.load_preset(str(custom)) + inv = runner.build_invocations(preset, results_dir=Path("/results"))[0] + pairs = _argv_to_dict(inv.argv[6:]) + assert pairs["--output-dir"] == "/explicit/path" + + +def test_all_models_has_four_jobs_with_distinct_gpus( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("all_models") + invocations = runner.build_invocations(preset, results_dir=Path("/r")) + assert [i.job.name for i in invocations] == ["boltz2_xrd", "boltz2_md", "rf3", "protenix"] + gpu_assignments = [i.env["CUDA_VISIBLE_DEVICES"] for i in invocations] + assert gpu_assignments == ["0,1", "2,3", "4,5", "6,7"] + + +def test_protenix_dual_uses_different_checkpoints(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("protenix_dual") + invocations = runner.build_invocations(preset, results_dir=Path("/r")) + pairs = [_argv_to_dict(i.argv[6:]) for i in invocations] + assert pairs[0]["--model-checkpoint"] == "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt" + assert pairs[1]["--model-checkpoint"] == "/extra_checkpoints/protenix_mini_default_v0.5.0.pt" + + +def test_rf3_partial_chiral_off_flag_present(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("rf3_partial_chiral_off") + inv = runner.build_invocations(preset, results_dir=Path("/r"))[0] + assert "--disable-chiral-features" in inv.argv + assert "--force-all" in inv.argv + + +def _argv_to_dict(tail: list[str]) -> dict[str, object]: + """Turn ``[--a, 1, --b, --c, 2]`` into ``{'--a': '1', '--b': True, '--c': '2'}``.""" + out: dict[str, object] = {} + i = 0 + while i < len(tail): + flag = tail[i] + assert flag.startswith("--"), f"unexpected positional: {flag}" + if i + 1 < 
len(tail) and not tail[i + 1].startswith("--"): + out[flag] = tail[i + 1] + i += 2 + else: + out[flag] = True + i += 1 + return out From f25a71eea84b7a991b42f551caba8cce9f746767 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 21:06:19 -0700 Subject: [PATCH 02/28] style(runs): fix ruff lint findings Ruff auto-fixed import ordering (I001) and unused imports (F401); two remaining were a leftover unused local `rf3 = preset.job("rf3")` (F841) and one over-100-char assertion (E501) extracted into a local. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sampleworks/runs/loader.py | 1 + src/sampleworks/runs/runner.py | 1 + src/sampleworks/runs/schema.py | 1 + tests/runs/test_cli.py | 1 - tests/runs/test_loader.py | 7 ++----- tests/runs/test_runner.py | 1 - 6 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py index 328b4155..906abcca 100644 --- a/src/sampleworks/runs/loader.py +++ b/src/sampleworks/runs/loader.py @@ -18,6 +18,7 @@ from .schema import Job, Preset + _BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets" _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py index 2623ba22..87b918b0 100644 --- a/src/sampleworks/runs/runner.py +++ b/src/sampleworks/runs/runner.py @@ -14,6 +14,7 @@ from .schema import Job, Preset + GRID_SEARCH_SCRIPT = "/app/run_grid_search.py" diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py index 216a6bc0..2f50a3e9 100644 --- a/src/sampleworks/runs/schema.py +++ b/src/sampleworks/runs/schema.py @@ -10,6 +10,7 @@ from dataclasses import dataclass, field from typing import Any + VALID_PIXI_ENVS = ("boltz", "protenix", "rf3") diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py index c93afe2c..0e22cd5f 100644 --- a/tests/runs/test_cli.py +++ b/tests/runs/test_cli.py @@ -5,7 +5,6 @@ from pathlib import Path import pytest - from 
sampleworks.runs import cli diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py index 66eeb5b9..c828b7c9 100644 --- a/tests/runs/test_loader.py +++ b/tests/runs/test_loader.py @@ -2,13 +2,10 @@ from __future__ import annotations -import os from pathlib import Path import pytest - from sampleworks.runs import loader -from sampleworks.runs.schema import Preset BUNDLED = ["all_models", "rf3_partial", "rf3_partial_chiral_off", "protenix_dual", "rf3_protenix"] @@ -34,7 +31,6 @@ def test_env_var_wins_over_defaults_block(monkeypatch: pytest.MonkeyPatch) -> No monkeypatch.setenv("DATA_DIR", "/from/env") preset = loader.load_preset("rf3_partial") assert preset.defaults["DATA_DIR"] == "/from/env" - rf3 = preset.job("rf3") # PROTEINS_CSV expands to ${DATA_DIR}/proteins.csv; DATA_DIR overridden by env proteins = preset.shared_args["proteins"] assert proteins == "/from/env/proteins.csv" @@ -44,7 +40,8 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DATA_DIR", raising=False) monkeypatch.setenv("HOME", "/home/test") preset = loader.load_preset("rf3_partial") - assert preset.defaults["DATA_DIR"] == "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" + expected = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" + assert preset.defaults["DATA_DIR"] == expected def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None: diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py index 4ac31ab9..050d7458 100644 --- a/tests/runs/test_runner.py +++ b/tests/runs/test_runner.py @@ -5,7 +5,6 @@ from pathlib import Path import pytest - from sampleworks.runs import loader, runner From c3f0d12fec2eeba9ba184a6a2e1d7662854ff751 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 21:11:46 -0700 Subject: [PATCH 03/28] style(runs): apply ruff format CI's lint job runs both `ruff check` and `ruff format --check`. 
The prior commit fixed only the former. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/runs/test_cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py index 0e22cd5f..1023f0b3 100644 --- a/tests/runs/test_cli.py +++ b/tests/runs/test_cli.py @@ -36,9 +36,7 @@ def test_dry_run_does_not_invoke_subprocess( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: monkeypatch.setenv("HOME", str(tmp_path)) - exit_code = cli.main( - ["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)] - ) + exit_code = cli.main(["rf3_partial", "--dry-run", "--results-dir", str(tmp_path)]) assert exit_code == 0 out = capsys.readouterr().out assert "pixi run -e rf3 python /app/run_grid_search.py" in out From d5d065a3ff8f3699e303238bcec2091cde057491 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 21:17:15 -0700 Subject: [PATCH 04/28] fix(runs): address CodeRabbit review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cli.py: `--only` filter now preserves `shared_args`; previously the filtered Preset dropped them, silently losing every job's shared flags. - loader.py: `--set` now rejects unknown top-level keys. A typo like `--set job.rf3.gpus=0` (note missing 's') used to auto-create an unused `job` dict and silently no-op; now it raises KeyError with the valid keys listed. - runner.py: handle partial spawn failures. If one job fails to spawn midway, already-launched jobs are terminated and joined instead of orphaned. Also wraps Popen in try/close to avoid log-file handle leak if the subprocess fails to start. - presets/rf3_partial_chiral_off.toml: change `output_subdir = "."` (write to RESULTS_DIR root) to `"rf3"` (subdir under RESULTS_DIR) for consistency with the other RF3 presets and collision safety when RESULTS_DIR is overridden to a shared location. 
Skipped (faithful to bash original): rf3_protenix.toml asymmetric partial-diffusion-step — the source `run_rf3_protenix_mdc_actl.sh` deliberately set it only for the Protenix job, not RF3. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sampleworks/runs/cli.py | 1 + src/sampleworks/runs/loader.py | 8 +++++ .../runs/presets/rf3_partial_chiral_off.toml | 2 +- src/sampleworks/runs/runner.py | 36 ++++++++++++++----- tests/runs/test_loader.py | 7 ++++ 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py index 5a76f4e2..75b3ab2e 100644 --- a/src/sampleworks/runs/cli.py +++ b/src/sampleworks/runs/cli.py @@ -87,6 +87,7 @@ def _filter_only(preset: Preset, only: str) -> Preset: name=preset.name, description=preset.description, defaults=preset.defaults, + shared_args=preset.shared_args, jobs=keep, ) diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py index 906abcca..65be05ac 100644 --- a/src/sampleworks/runs/loader.py +++ b/src/sampleworks/runs/loader.py @@ -64,9 +64,17 @@ def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any return raw +_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"}) + + def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None: """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``.""" parts = dotted.split(".") + if parts[0] not in _TOP_LEVEL_KEYS: + raise KeyError( + f"--set: unknown top-level key {parts[0]!r} in {dotted!r}. 
" + f"Valid top-level keys: {sorted(_TOP_LEVEL_KEYS)}" + ) cursor: Any = obj for i, part in enumerate(parts[:-1]): cursor = _index(cursor, part, where=".".join(parts[: i + 1])) diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml index c1f34820..bd3a1311 100644 --- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml +++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml @@ -22,5 +22,5 @@ disable-chiral-features = true name = "rf3" env = "rf3" gpus = "5" -output_subdir = "." +output_subdir = "rf3" args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" } diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py index 87b918b0..4e273afa 100644 --- a/src/sampleworks/runs/runner.py +++ b/src/sampleworks/runs/runner.py @@ -64,10 +64,26 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: return 0 _print_launch_summary(preset, invocations) - processes = [_spawn(inv) for inv in invocations] + processes: list[_RunningJob] = [] + try: + for inv in invocations: + processes.append(_spawn(inv)) + except BaseException: + _terminate_all(processes) + raise return _wait_all(processes) +def _terminate_all(jobs: list[_RunningJob]) -> None: + """Terminate any already-launched jobs (used when a later spawn fails).""" + for j in jobs: + if j.proc.poll() is None: + j.proc.terminate() + for j in jobs: + j.proc.wait() + j.tee_thread.join() + + def _print_dry_run(inv: JobInvocation) -> None: print(f"# job: {inv.job.name} (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr) print(f"# log: {inv.log_path}", file=sys.stderr) @@ -99,13 +115,17 @@ class _RunningJob: def _spawn(inv: JobInvocation) -> _RunningJob: inv.log_path.parent.mkdir(parents=True, exist_ok=True) log_file = open(inv.log_path, "wb") - proc = subprocess.Popen( - inv.argv, - env=inv.env, - 
stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=0, - ) + try: + proc = subprocess.Popen( + inv.argv, + env=inv.env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=0, + ) + except BaseException: + log_file.close() + raise assert proc.stdout is not None thread = threading.Thread( target=_tee, diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py index c828b7c9..d8d6509c 100644 --- a/tests/runs/test_loader.py +++ b/tests/runs/test_loader.py @@ -130,6 +130,13 @@ def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None: loader.load_preset("rf3_partial", overrides=["bogus_no_equals"]) +def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch) -> None: + """Typos like ``--set job.rf3.gpus=0`` (missing 's' in jobs) must not silently no-op.""" + monkeypatch.setenv("HOME", "/home/test") + with pytest.raises(KeyError, match="unknown top-level key"): + loader.load_preset("rf3_partial", overrides=["job.rf3.gpus=0"]) + + def test_bad_env_rejected(tmp_path: Path) -> None: bad = tmp_path / "bad.toml" bad.write_text( From a89be9d2eea84bb971de5c2e0eea019decc5bb53 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 21:22:52 -0700 Subject: [PATCH 05/28] style(runs): NumPy-style docstrings + frozen dataclasses Aligns the new sampleworks.runs module with project style policy (AGENTS.md L7 and L338): - Add NumPy-style docstrings (summary, Parameters, Returns, Raises) to every public and private function/class across schema.py, loader.py, runner.py, and cli.py. - Mark Job, Preset, JobInvocation, and _RunningJob as @dataclass(frozen=True). Behavior is preserved: Preset.effective_args already returns a fresh dict, and the runner mutates that local copy rather than the Preset itself. All 32 unit tests still pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sampleworks/runs/cli.py | 65 +++++++++ src/sampleworks/runs/loader.py | 250 +++++++++++++++++++++++++++++++-- src/sampleworks/runs/runner.py | 167 +++++++++++++++++++++- src/sampleworks/runs/schema.py | 89 +++++++++++- 4 files changed, 553 insertions(+), 18 deletions(-) diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py index 75b3ab2e..7cb64f93 100644 --- a/src/sampleworks/runs/cli.py +++ b/src/sampleworks/runs/cli.py @@ -12,6 +12,20 @@ def main(argv: list[str] | None = None) -> int: + """Entry point for the ``sampleworks-runs`` console script. + + Parameters + ---------- + argv : list of str or None, optional + Command-line arguments excluding the program name. When ``None`` + (the default), :mod:`argparse` reads from :data:`sys.argv`. + + Returns + ------- + int + Exit code suitable for ``sys.exit``: ``0`` on success, non-zero on + job failure or fatal CLI error. + """ parser = _build_parser() args = parser.parse_args(argv) @@ -36,6 +50,13 @@ def main(argv: list[str] | None = None) -> int: def _build_parser() -> argparse.ArgumentParser: + """Construct the :mod:`argparse` parser for ``sampleworks-runs``. + + Returns + ------- + argparse.ArgumentParser + Parser covering preset selection, overrides, and execution flags. + """ parser = argparse.ArgumentParser( prog="sampleworks-runs", description=( @@ -78,6 +99,26 @@ def _build_parser() -> argparse.ArgumentParser: def _filter_only(preset: Preset, only: str) -> Preset: + """Return a new :class:`Preset` containing only the named jobs. + + Parameters + ---------- + preset : Preset + Source preset. + only : str + Comma-separated list of job names to keep. + + Returns + ------- + Preset + New preset with the same ``description``, ``defaults``, and + ``shared_args`` and only the filtered jobs. + + Raises + ------ + SystemExit + If any name in ``only`` does not match a job in ``preset``. 
+ """ names = [n.strip() for n in only.split(",") if n.strip()] keep = [j for j in preset.jobs if j.name in names] missing = set(names) - {j.name for j in keep} @@ -93,6 +134,13 @@ def _filter_only(preset: Preset, only: str) -> Preset: def _print_show(preset: Preset) -> None: + """Print a human-readable rendering of a resolved preset to stdout. + + Parameters + ---------- + preset : Preset + Resolved preset to display (used by ``--show``). + """ print(f"name: {preset.name}") if preset.description: print(f"description: {preset.description}") @@ -112,6 +160,23 @@ def _print_show(preset: Preset) -> None: def _default_results_dir(preset: Preset) -> str: + """Pick a sensible default ``--results-dir`` when none is given. + + Order of preference: + 1. The preset's ``[defaults]`` ``RESULTS_DIR``. + 2. The ``RESULTS_DIR`` environment variable. + 3. ``./grid_search_results``. + + Parameters + ---------- + preset : Preset + Resolved preset (its ``defaults`` have already been merged with env). + + Returns + ------- + str + Path to use as the run's root output directory. + """ return ( preset.defaults.get("RESULTS_DIR") or os.environ.get("RESULTS_DIR") diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py index 65be05ac..ce8b130c 100644 --- a/src/sampleworks/runs/loader.py +++ b/src/sampleworks/runs/loader.py @@ -21,16 +21,50 @@ _BUNDLED_PRESETS_PACKAGE = "sampleworks.runs.presets" _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") +_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"}) def list_bundled_presets() -> list[str]: - """Return the names (sans ``.toml``) of bundled presets, sorted.""" + """List the names of all TOML presets shipped with the package. + + Returns + ------- + list of str + Preset names (filename stems, no ``.toml`` extension), sorted + alphabetically. 
+ """ files = resources.files(_BUNDLED_PRESETS_PACKAGE) return sorted(p.name.removesuffix(".toml") for p in files.iterdir() if p.name.endswith(".toml")) def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset: - """Load a preset by bundled name or filesystem path, applying ``--set`` overrides.""" + """Load a preset by bundled name or filesystem path. + + Parameters + ---------- + name_or_path : str + Either the name of a bundled preset (as returned by + :func:`list_bundled_presets`) or a path ending in ``.toml``. + overrides : Iterable of str, optional + ``KEY=VALUE`` strings as accepted by ``--set``. Applied before + variable interpolation. + + Returns + ------- + Preset + Fully resolved preset ready for :func:`runner.run`. + + Raises + ------ + FileNotFoundError + If ``name_or_path`` matches no bundled preset and no file on disk. + KeyError + If an override path begins with an unknown top-level key, or if a + ``${VAR}`` reference cannot be resolved against the environment or + the ``[defaults]`` block. + ValueError + If an override is malformed (missing ``=``). + """ raw = _read_toml(name_or_path) overrides_list = list(overrides) raw = _apply_overrides(raw, overrides_list) @@ -39,6 +73,23 @@ def load_preset(name_or_path: str, *, overrides: Iterable[str] = ()) -> Preset: def _read_toml(name_or_path: str) -> dict[str, Any]: + """Read raw TOML from a filesystem path or a bundled package resource. + + Parameters + ---------- + name_or_path : str + Bundled preset name or filesystem path ending in ``.toml``. + + Returns + ------- + dict of str to Any + Parsed TOML, before override application or interpolation. + + Raises + ------ + FileNotFoundError + If neither location yields a TOML file. 
+ """ path = Path(name_or_path) if path.suffix == ".toml" and path.exists(): return tomllib.loads(path.read_text()) @@ -52,10 +103,44 @@ def _read_toml(name_or_path: str) -> dict[str, Any]: def _preset_name(name_or_path: str) -> str: + """Return the canonical preset name for a bundled name or path argument. + + Parameters + ---------- + name_or_path : str + Either a bundled name or a path ending in ``.toml``. + + Returns + ------- + str + Filename stem if ``name_or_path`` looks like a path; otherwise the + argument unchanged. + """ return Path(name_or_path).stem if name_or_path.endswith(".toml") else name_or_path def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any]: + """Apply each ``KEY=VALUE`` override to the raw preset dict in place. + + Parameters + ---------- + raw : dict of str to Any + Parsed TOML to mutate. + overrides : list of str + Each entry must contain exactly one ``=``. + + Returns + ------- + dict of str to Any + The same ``raw`` dict (mutated). + + Raises + ------ + ValueError + If an override is missing the ``=`` separator. + KeyError + If an override's top-level key is unknown. + """ for spec in overrides: if "=" not in spec: raise ValueError(f"--set expects KEY=VALUE, got {spec!r}") @@ -64,11 +149,29 @@ def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any return raw -_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"}) - - def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None: - """Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``.""" + """Set a nested value in ``obj`` addressed by a dotted path. + + Job-list elements can be addressed by job name or by integer index. + + Parameters + ---------- + obj : dict of str to Any + Root dict to mutate. + dotted : str + Dotted path, e.g. ``"jobs.rf3.args.gradient-weights"`` or + ``"defaults.DATA_DIR"``. + value : Any + Coerced value to write at the leaf. 
+ + Raises + ------ + KeyError + If the first segment is not one of :data:`_TOP_LEVEL_KEYS`, or if a + list segment names a missing job (a bad numeric index raises IndexError). + TypeError + If the path attempts to descend through a non-container value. + """ parts = dotted.split(".") if parts[0] not in _TOP_LEVEL_KEYS: raise KeyError( @@ -87,6 +190,27 @@ def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None: def _index(cursor: Any, part: str, *, where: str) -> Any: + """Descend one level into a dict or list, auto-creating empty intermediates. + + Parameters + ---------- + cursor : Any + Current node in the traversal. + part : str + Next segment of the dotted path. + where : str + Path so far, used in error messages. + + Returns + ------- + Any + The child node. + + Raises + ------ + TypeError + If ``cursor`` is neither a dict nor a list. + """ if isinstance(cursor, list): return cursor[_find_in_list(cursor, part, where=where)] if isinstance(cursor, dict): @@ -97,6 +221,28 @@ def _index(cursor: Any, part: str, *, where: str) -> Any: def _find_in_list(items: list[Any], key: str, *, where: str) -> int: + """Locate a list element by integer index or by ``name`` field. + + Parameters + ---------- + items : list of Any + List to search. + key : str + Numeric string (positive or negative index) or a name to match against + each element's ``"name"`` key. + where : str + Path so far, used in error messages. + + Returns + ------- + int + Index of the matching element. + + Raises + ------ + KeyError + If no element with the given name exists. + """ if key.isdigit() or (key.startswith("-") and key[1:].isdigit()): return int(key) for i, item in enumerate(items): @@ -106,6 +252,19 @@ def _find_in_list(items: list[Any], key: str, *, where: str) -> int: def _coerce(value: str) -> Any: + """Convert a string CLI override value to bool, int, float, or leave as str. + + Parameters + ---------- + value : str + Right-hand side of ``KEY=VALUE``.
+ + Returns + ------- + Any + ``True``/``False`` for ``"true"``/``"false"`` (case-insensitive); + ``int`` or ``float`` if parseable; otherwise the original string. + """ if value.lower() in ("true", "false"): return value.lower() == "true" try: @@ -120,10 +279,27 @@ def _coerce(value: str) -> Any: def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]: - """Expand ``${VAR}`` in every string. Env wins; defaults block fills gaps. - - Defaults are resolved in TOML order, so later defaults can reference earlier ones - (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``). + """Expand ``${VAR}`` references throughout the raw preset. + + Defaults are resolved in TOML order, so later defaults can reference + earlier ones (e.g. ``PROTEINS_CSV = "${DATA_DIR}/proteins.csv"``). Process + environment variables take precedence over the ``[defaults]`` block. + + Parameters + ---------- + raw : dict of str to Any + Parsed TOML, after override application. + + Returns + ------- + dict of str to Any + New dict with all string values fully expanded and ``defaults`` + replaced with the resolved values. + + Raises + ------ + KeyError + If any ``${VAR}`` cannot be resolved. """ defaults: dict[str, str] = dict(raw.get("defaults", {})) accumulated: dict[str, str] = dict(os.environ) @@ -140,6 +316,20 @@ def _resolve_variables(raw: dict[str, Any]) -> dict[str, Any]: def _walk(obj: Any, env: dict[str, str]) -> Any: + """Recursively expand ``${VAR}`` in every string within ``obj``. + + Parameters + ---------- + obj : Any + Arbitrary nested dict/list/scalar. + env : dict of str to str + Resolved variable map. + + Returns + ------- + Any + Structurally identical copy with strings expanded. + """ if isinstance(obj, dict): return {k: _walk(v, env) for k, v in obj.items()} if isinstance(obj, list): @@ -150,6 +340,26 @@ def _walk(obj: Any, env: dict[str, str]) -> Any: def _expand(text: str, env: dict[str, str]) -> str: + """Substitute ``${VAR}`` references in ``text`` until a fixed point. 
+ + Parameters + ---------- + text : str + String potentially containing ``${VAR}`` references. + env : dict of str to str + Variable map. + + Returns + ------- + str + Fully expanded string. + + Raises + ------ + KeyError + If a referenced variable is not in ``env``. + """ + def repl(match: re.Match[str]) -> str: var = match.group(1) if var not in env: @@ -165,6 +375,26 @@ def repl(match: re.Match[str]) -> str: def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset: + """Construct a :class:`Preset` from a resolved raw dict. + + Parameters + ---------- + name : str + Preset name (assigned to :attr:`Preset.name`). + raw : dict of str to Any + Resolved TOML. + + Returns + ------- + Preset + Validated preset. + + Raises + ------ + ValueError + If ``raw['jobs']`` is not a list, or if any :class:`Job` / + :class:`Preset` invariant fails (see their docstrings). + """ raw_jobs = raw.get("jobs", []) if not isinstance(raw_jobs, list): raise ValueError(f"Preset {name!r}: 'jobs' must be a list") diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py index 4e273afa..aaf32f30 100644 --- a/src/sampleworks/runs/runner.py +++ b/src/sampleworks/runs/runner.py @@ -18,8 +18,22 @@ GRID_SEARCH_SCRIPT = "/app/run_grid_search.py" -@dataclass +@dataclass(frozen=True) class JobInvocation: + """The fully resolved command to launch for one job. + + Parameters + ---------- + job : Job + Originating :class:`Job` (kept for introspection in logs). + argv : list of str + Subprocess command line (starts with ``pixi run -e python ...``). + env : dict of str to str + Process environment, including ``CUDA_VISIBLE_DEVICES``. + log_path : Path + File to tee stdout+stderr into. 
+ """ + job: Job argv: list[str] env: dict[str, str] @@ -27,7 +41,24 @@ class JobInvocation: def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocation]: - """Build the subprocess argv + env + log path for every job in the preset.""" + """Build the subprocess invocation for every job in the preset. + + Per-job ``args`` are merged on top of :attr:`Preset.shared_args`, with + ``--output-dir`` auto-injected from ``results_dir / job.output_subdir`` if + not already present. + + Parameters + ---------- + preset : Preset + Resolved preset to launch. + results_dir : Path + Root directory for outputs and per-job log files. + + Returns + ------- + list of JobInvocation + One :class:`JobInvocation` per job, in declaration order. + """ invocations: list[JobInvocation] = [] for job in preset.jobs: args = preset.effective_args(job) @@ -40,6 +71,23 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]: + """Assemble the ``pixi run`` argv list for one job's args dict. + + ``True`` bools become bare flags, ``False``/``None`` are dropped, all other + values are stringified. + + Parameters + ---------- + pixi_env : str + Pixi environment name passed to ``-e``. + args : dict of str to Any + Flag-name to value map (kebab-case keys, no leading ``--``). + + Returns + ------- + list of str + Subprocess argv. + """ argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT] for key, value in args.items(): flag = f"--{key}" @@ -54,7 +102,26 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]: def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: - """Launch every job in parallel; tee output to per-job logs; return 0 iff all succeed.""" + """Launch every job in parallel and wait for completion. 
+ + Stdout+stderr from each job is teed to a per-job log file under + ``results_dir`` and also echoed to the driver's stderr with a ``[job_name]`` + prefix. + + Parameters + ---------- + preset : Preset + Preset to launch. + results_dir : Path + Root directory for outputs and logs. Created if missing. + dry_run : bool, optional + If True, print the resolved commands instead of launching anything. + + Returns + ------- + int + ``0`` if all jobs exited 0 (or ``dry_run`` was set), ``1`` otherwise. + """ results_dir.mkdir(parents=True, exist_ok=True) invocations = build_invocations(preset, results_dir=results_dir) @@ -75,7 +142,14 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: def _terminate_all(jobs: list[_RunningJob]) -> None: - """Terminate any already-launched jobs (used when a later spawn fails).""" + """Terminate any already-launched jobs (used when a later spawn fails). + + Parameters + ---------- + jobs : list of _RunningJob + Jobs whose subprocesses should be SIGTERM'd, waited on, and whose tee + threads should be joined. + """ for j in jobs: if j.proc.poll() is None: j.proc.terminate() @@ -85,6 +159,13 @@ def _terminate_all(jobs: list[_RunningJob]) -> None: def _print_dry_run(inv: JobInvocation) -> None: + """Print the exact command for one job without launching it. + + Parameters + ---------- + inv : JobInvocation + Invocation to print. + """ print(f"# job: {inv.job.name} (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr) print(f"# log: {inv.log_path}", file=sys.stderr) print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}") @@ -92,6 +173,15 @@ def _print_dry_run(inv: JobInvocation) -> None: def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> None: + """Print a banner describing what is about to be launched. + + Parameters + ---------- + preset : Preset + Preset being launched. + invocations : list of JobInvocation + Jobs about to be spawned. 
+ """ bar = "=" * 60 print(bar, file=sys.stderr) print(f"preset: {preset.name}", file=sys.stderr) @@ -105,14 +195,44 @@ def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> N print(bar, file=sys.stderr) -@dataclass +@dataclass(frozen=True) class _RunningJob: + """Internal handle: a spawned subprocess and its log-tee thread. + + Parameters + ---------- + inv : JobInvocation + Originating invocation. + proc : subprocess.Popen + The subprocess (PIPE'd stdout merged with stderr). + tee_thread : threading.Thread + Daemon thread copying ``proc.stdout`` to the log file and to + ``sys.stderr`` with a per-job prefix. + """ + inv: JobInvocation proc: subprocess.Popen[bytes] tee_thread: threading.Thread def _spawn(inv: JobInvocation) -> _RunningJob: + """Start one subprocess and a thread to tee its output. + + Parameters + ---------- + inv : JobInvocation + Invocation to spawn. + + Returns + ------- + _RunningJob + Handle covering the subprocess and the tee thread. + + Raises + ------ + OSError + Propagated if the subprocess fails to start (e.g. binary missing). + """ inv.log_path.parent.mkdir(parents=True, exist_ok=True) log_file = open(inv.log_path, "wb") try: @@ -138,6 +258,18 @@ def _spawn(inv: JobInvocation) -> _RunningJob: def _wait_all(jobs: list[_RunningJob]) -> int: + """Wait for every job to exit and aggregate their exit codes. + + Parameters + ---------- + jobs : list of _RunningJob + Jobs to wait on. + + Returns + ------- + int + ``0`` if all jobs exited 0, ``1`` if any failed. + """ failures = 0 for j in jobs: exit_code = j.proc.wait() @@ -151,6 +283,18 @@ def _wait_all(jobs: list[_RunningJob]) -> int: def _tee(prefix: str, src: Any, dest: Any) -> None: + """Copy bytes from ``src`` to ``dest`` and to stderr with a label. + + Parameters + ---------- + prefix : str + Per-line label prepended to the stderr echo (e.g. job name). + src : file-like + Readable byte stream (typically ``Popen.stdout`` with stderr merged). 
+ dest : file-like + Writable byte stream for the on-disk log file. Closed when ``src`` is + exhausted. + """ for line in iter(src.readline, b""): dest.write(line) dest.flush() @@ -160,8 +304,21 @@ def _tee(prefix: str, src: Any, dest: Any) -> None: def _ts() -> str: + """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string.""" return time.strftime("%Y-%m-%d %H:%M:%S") def _shell_join(argv: list[str]) -> str: + """Quote ``argv`` so the result can be pasted into a POSIX shell. + + Parameters + ---------- + argv : list of str + Argument vector. + + Returns + ------- + str + Single shell-quoted command line. + """ return shlex.join(argv) diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py index 2f50a3e9..6e1b768a 100644 --- a/src/sampleworks/runs/schema.py +++ b/src/sampleworks/runs/schema.py @@ -14,8 +14,37 @@ VALID_PIXI_ENVS = ("boltz", "protenix", "rf3") -@dataclass +@dataclass(frozen=True) class Job: + """One parallel `run_grid_search.py` invocation within a preset. + + Parameters + ---------- + name : str + Identifier used for per-job log files and ``--only`` selection. Must be + unique within the parent :class:`Preset`. + env : str + Pixi environment to run the job in. Must be one of + :data:`VALID_PIXI_ENVS`. + gpus : str + Value to set as ``CUDA_VISIBLE_DEVICES`` for the subprocess (e.g. + ``"4"`` or ``"0,1"``). + output_subdir : str + Path appended to the run's ``results_dir`` to form the job's + ``--output-dir`` argument, when one is not given explicitly in ``args``. + args : dict of str to Any, optional + Per-job overrides merged on top of the preset's + :attr:`Preset.shared_args`. Keys are CLI flag names (without the + leading ``--``); bools become bare flags (``True``) or omitted + (``False``). + + Raises + ------ + ValueError + If ``env`` is not in :data:`VALID_PIXI_ENVS`, or if ``gpus`` / + ``output_subdir`` is empty. 
+ """ + name: str env: str gpus: str @@ -23,6 +52,7 @@ class Job: args: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: + """Validate ``env`` and required string fields.""" if self.env not in VALID_PIXI_ENVS: raise ValueError( f"Job {self.name!r}: env must be one of {VALID_PIXI_ENVS}, got {self.env!r}" @@ -33,8 +63,32 @@ def __post_init__(self) -> None: raise ValueError(f"Job {self.name!r}: output_subdir must be non-empty") -@dataclass +@dataclass(frozen=True) class Preset: + """A named bundle of parallel jobs orchestrated as a unit. + + Parameters + ---------- + name : str + Identifier (matches the bundled TOML filename without the ``.toml`` + suffix, or the stem of a user-supplied path). + description : str + Human-readable summary shown by ``--list`` and the launch banner. + defaults : dict of str to str, optional + Default values for ``${VAR}`` interpolation. The process environment + takes precedence; this block only fills in unset keys. + shared_args : dict of str to Any, optional + Args merged into every job's ``args`` before argv is built. Per-job + ``args`` win on collision. + jobs : list of Job + Jobs to launch in parallel. Must be non-empty and have unique names. + + Raises + ------ + ValueError + If ``jobs`` is empty or contains duplicate names. + """ + name: str description: str defaults: dict[str, str] = field(default_factory=dict) @@ -42,6 +96,7 @@ class Preset: jobs: list[Job] = field(default_factory=list) def __post_init__(self) -> None: + """Validate the job list is non-empty and names are unique.""" if not self.jobs: raise ValueError(f"Preset {self.name!r}: must declare at least one job") seen: set[str] = set() @@ -51,11 +106,39 @@ def __post_init__(self) -> None: seen.add(job.name) def job(self, name: str) -> Job: + """Return the :class:`Job` with the given name. + + Parameters + ---------- + name : str + Job name to look up. + + Returns + ------- + Job + The matching job. 
+ + Raises + ------ + KeyError + If no job has the given name. + """ for j in self.jobs: if j.name == name: return j raise KeyError(f"Preset {self.name!r} has no job {name!r}") def effective_args(self, job: Job) -> dict[str, Any]: - """Return ``shared_args`` merged with per-job overrides.""" + """Merge :attr:`shared_args` with a job's per-job overrides. + + Parameters + ---------- + job : Job + Job whose ``args`` override the shared defaults. + + Returns + ------- + dict of str to Any + New dict; mutating it does not affect the preset. + """ return {**self.shared_args, **job.args} From ce82079f6f4cee558f2511b846e51b725d3400db Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Mon, 18 May 2026 23:14:40 -0700 Subject: [PATCH 06/28] fix(runs): point presets at /mnt/diffuse-shared paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match the actual mounted layout on the ACTL pod: /mnt/diffuse-shared/raw/sampleworks/ ├── initial_dataset_40/ # DATA_DIR for all_models ├── initial_dataset_40_occ_sweeps/ # DATA_DIR for occ-sweep presets ├── actl_msa_cache/ # MSA_CACHE_DIR for all presets └── actl_results// # RESULTS_DIR namespaced per preset Per-preset RESULTS_DIR avoids cross-preset collisions when multiple presets share output_subdirs (e.g. all_models and rf3_partial both have a job named "rf3"). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sampleworks/runs/presets/all_models.toml | 6 +++--- src/sampleworks/runs/presets/protenix_dual.toml | 6 +++--- src/sampleworks/runs/presets/rf3_partial.toml | 6 +++--- src/sampleworks/runs/presets/rf3_partial_chiral_off.toml | 6 +++--- src/sampleworks/runs/presets/rf3_protenix.toml | 6 +++--- tests/runs/test_loader.py | 2 +- tests/runs/test_runner.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml index 21d1cd92..4ed1bcf7 100644 --- a/src/sampleworks/runs/presets/all_models.toml +++ b/src/sampleworks/runs/presets/all_models.toml @@ -1,9 +1,9 @@ description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)." [defaults] -DATA_DIR = "/data/input" -RESULTS_DIR = "/data/results" -MSA_CACHE_DIR = "${HOME}/.sampleworks/msa" +DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40" +RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/all_models" +MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" [shared_args] diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml index 461d547e..1f4c0e36 100644 --- a/src/sampleworks/runs/presets/protenix_dual.toml +++ b/src/sampleworks/runs/presets/protenix_dual.toml @@ -1,9 +1,9 @@ description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)." 
[defaults] -DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results" -MSA_CACHE_DIR = "/data/sampleworks-exp/msa_cache" +DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/protenix_dual" +MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt" PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt" diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml index 533accfb..911552e1 100644 --- a/src/sampleworks/runs/presets/rf3_partial.toml +++ b/src/sampleworks/runs/presets/rf3_partial.toml @@ -1,9 +1,9 @@ description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)." [defaults] -DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results" -MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial" +MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml index bd3a1311..2f1e1817 100644 --- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml +++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml @@ -1,9 +1,9 @@ description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep." 
[defaults] -DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/data/sampleworks-exp/occ_sweep/grid_search_results_rf3_chiral_off" -MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial_chiral_off" +MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml index 9255cfea..32d2eb23 100644 --- a/src/sampleworks/runs/presets/rf3_protenix.toml +++ b/src/sampleworks/runs/presets/rf3_protenix.toml @@ -1,9 +1,9 @@ description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)." [defaults] -DATA_DIR = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "${HOME}/sampleworks-exp/occ_sweep/grid_search_results" -MSA_CACHE_DIR = "${HOME}/sampleworks-exp/msa_cache" +DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" +RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_protenix" +MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" [shared_args] diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py index d8d6509c..13379a41 100644 --- a/tests/runs/test_loader.py +++ b/tests/runs/test_loader.py @@ -40,7 +40,7 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DATA_DIR", raising=False) monkeypatch.setenv("HOME", "/home/test") preset = loader.load_preset("rf3_partial") - expected = "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" + expected = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" assert 
preset.defaults["DATA_DIR"] == expected diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py index 050d7458..635a8c28 100644 --- a/tests/runs/test_runner.py +++ b/tests/runs/test_runner.py @@ -26,7 +26,7 @@ def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> N assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"] pairs = _argv_to_dict(argv[6:]) assert pairs["--proteins"] == ( - "/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv" + "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv" ) assert pairs["--model"] == "rf3" assert pairs["--scalers"] == "pure_guidance" From 09c367bcccb284164f1980190a97a141cceff055 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Tue, 19 May 2026 11:35:08 -0700 Subject: [PATCH 07/28] fix(runs): align preset paths with actl_setup_sampleworks_paths.sh The proteins.csv files in the shared dataset reference /data/inputs/ paths that only resolve after the pod-init script (actl_setup_sampleworks_paths.sh) creates the canonical symlinks. Point presets at those canonical paths so everything stays consistent: DATA_DIR /data/input | /data/inputs RESULTS_DIR /data/results/ MSA_CACHE_DIR /root/.sampleworks /data/results is namespaced by the pod-init script via $SAMPLEWORKS_ACTL_RUN_NAME (defaults to $HOSTNAME), so per-preset subdirs sit inside a per-session root. README documents the one-time setup script step. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 8 ++++++++ src/sampleworks/runs/presets/all_models.toml | 6 +++--- src/sampleworks/runs/presets/protenix_dual.toml | 6 +++--- src/sampleworks/runs/presets/rf3_partial.toml | 6 +++--- src/sampleworks/runs/presets/rf3_partial_chiral_off.toml | 6 +++--- src/sampleworks/runs/presets/rf3_protenix.toml | 6 +++--- tests/runs/test_loader.py | 3 +-- tests/runs/test_runner.py | 4 +--- 8 files changed, 25 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index efdcc537..bfa49541 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,14 @@ Instructions for running evaluation and metrics scripts are coming soon. For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes. +**Pod-side prerequisite.** Bundled presets reference the canonical `/data/inputs`, `/data/results`, and `/root/.sampleworks` paths set up by the ACTL pod-init script. On a fresh sampleworks pod, run once per session: + +```bash +bash /mnt/diffuse-shared/raw/sampleworks/actl_setup_sampleworks_paths.sh +``` + +That creates symlinks pointing the canonical paths at the shared mount (and namespaces `/data/results` by hostname or `$SAMPLEWORKS_ACTL_RUN_NAME`). Overrides via env var (`DATA_DIR=...`) or CLI (`--set defaults.DATA_DIR=...`) work without the symlinks. 
+ ```bash pixi run -e rf3 sampleworks-runs --list # bundled presets pixi run -e rf3 sampleworks-runs rf3_partial # run a preset diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml index 4ed1bcf7..72701997 100644 --- a/src/sampleworks/runs/presets/all_models.toml +++ b/src/sampleworks/runs/presets/all_models.toml @@ -1,9 +1,9 @@ description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)." [defaults] -DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40" -RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/all_models" -MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" +DATA_DIR = "/data/input" +RESULTS_DIR = "/data/results/all_models" +MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" [shared_args] diff --git a/src/sampleworks/runs/presets/protenix_dual.toml b/src/sampleworks/runs/presets/protenix_dual.toml index 1f4c0e36..231ee220 100644 --- a/src/sampleworks/runs/presets/protenix_dual.toml +++ b/src/sampleworks/runs/presets/protenix_dual.toml @@ -1,9 +1,9 @@ description = "Run Protenix tiny and mini variants in parallel (different checkpoints, same sweep)." 
[defaults] -DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/protenix_dual" -MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" +DATA_DIR = "/data/inputs" +RESULTS_DIR = "/data/results/protenix_dual" +MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" PROTENIX_TINY_CHECKPOINT = "/extra_checkpoints/protenix_tiny_default_v0.5.0.pt" PROTENIX_MINI_CHECKPOINT = "/extra_checkpoints/protenix_mini_default_v0.5.0.pt" diff --git a/src/sampleworks/runs/presets/rf3_partial.toml b/src/sampleworks/runs/presets/rf3_partial.toml index 911552e1..60a063e1 100644 --- a/src/sampleworks/runs/presets/rf3_partial.toml +++ b/src/sampleworks/runs/presets/rf3_partial.toml @@ -1,9 +1,9 @@ description = "RF3 partial-diffusion canonical occ-sweep on a single GPU (7 gradient weights)." [defaults] -DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial" -MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" +DATA_DIR = "/data/inputs" +RESULTS_DIR = "/data/results/rf3_partial" +MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" diff --git a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml index 2f1e1817..af0e5ac8 100644 --- a/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml +++ b/src/sampleworks/runs/presets/rf3_partial_chiral_off.toml @@ -1,9 +1,9 @@ description = "RF3 occ-sweep with --disable-chiral-features and a wider 10-weight sweep." 
[defaults] -DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_partial_chiral_off" -MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" +DATA_DIR = "/data/inputs" +RESULTS_DIR = "/data/results/rf3_partial_chiral_off" +MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" RF3_CHECKPOINT = "/checkpoints/rf3_foundry_01_24_latest.ckpt" diff --git a/src/sampleworks/runs/presets/rf3_protenix.toml b/src/sampleworks/runs/presets/rf3_protenix.toml index 32d2eb23..4ca5638d 100644 --- a/src/sampleworks/runs/presets/rf3_protenix.toml +++ b/src/sampleworks/runs/presets/rf3_protenix.toml @@ -1,9 +1,9 @@ description = "RF3 + Protenix combo on the occ-sweep dataset (RF3 on GPUs 0-3, Protenix on 4-7)." [defaults] -DATA_DIR = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_results/rf3_protenix" -MSA_CACHE_DIR = "/mnt/diffuse-shared/raw/sampleworks/actl_msa_cache" +DATA_DIR = "/data/inputs" +RESULTS_DIR = "/data/results/rf3_protenix" +MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" [shared_args] diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py index 13379a41..6528a602 100644 --- a/tests/runs/test_loader.py +++ b/tests/runs/test_loader.py @@ -40,8 +40,7 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DATA_DIR", raising=False) monkeypatch.setenv("HOME", "/home/test") preset = loader.load_preset("rf3_partial") - expected = "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" - assert preset.defaults["DATA_DIR"] == expected + assert preset.defaults["DATA_DIR"] == "/data/inputs" def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None: diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py index 635a8c28..39aefcb2 100644 --- 
a/tests/runs/test_runner.py +++ b/tests/runs/test_runner.py @@ -25,9 +25,7 @@ def test_argv_for_rf3_partial_matches_bash(monkeypatch: pytest.MonkeyPatch) -> N argv = inv.argv assert argv[:6] == ["pixi", "run", "-e", "rf3", "python", "/app/run_grid_search.py"] pairs = _argv_to_dict(argv[6:]) - assert pairs["--proteins"] == ( - "/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps/proteins.csv" - ) + assert pairs["--proteins"] == "/data/inputs/proteins.csv" assert pairs["--model"] == "rf3" assert pairs["--scalers"] == "pure_guidance" assert pairs["--partial-diffusion-step"] == "120" From ae77784ceeb10c1408b1d051ba0b24175ae09f85 Mon Sep 17 00:00:00 2001 From: Magomed Abdurakhmanov Date: Tue, 19 May 2026 11:39:06 -0700 Subject: [PATCH 08/28] fix(runs): mkdir the per-job --output-dir before launching run_grid_search.py writes its work-queue pickle (wjq_*.pkl) directly into --output-dir without creating it first, so the orchestrator must. Previously only results_dir itself was mkdir'd, which broke any preset whose output_subdir didn't already exist on disk. JobInvocation now carries the resolved output_dir explicitly so _spawn can mkdir it alongside the log directory. --dry-run still doesn't touch the filesystem. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sampleworks/runs/runner.py | 10 +++++++++- tests/runs/test_runner.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py index aaf32f30..6a5c6fbb 100644 --- a/src/sampleworks/runs/runner.py +++ b/src/sampleworks/runs/runner.py @@ -32,12 +32,16 @@ class JobInvocation: Process environment, including ``CUDA_VISIBLE_DEVICES``. log_path : Path File to tee stdout+stderr into. + output_dir : Path + Resolved ``--output-dir`` value (mkdir'd by the runner before launch + because ``run_grid_search.py`` assumes its existence). 
""" job: Job argv: list[str] env: dict[str, str] log_path: Path + output_dir: Path def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocation]: @@ -66,7 +70,10 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio argv = _build_argv(job.env, args) env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus} log_path = results_dir / f"{job.name}_run.log" - invocations.append(JobInvocation(job=job, argv=argv, env=env, log_path=log_path)) + output_dir = Path(args["output-dir"]) + invocations.append( + JobInvocation(job=job, argv=argv, env=env, log_path=log_path, output_dir=output_dir) + ) return invocations @@ -234,6 +241,7 @@ def _spawn(inv: JobInvocation) -> _RunningJob: Propagated if the subprocess fails to start (e.g. binary missing). """ inv.log_path.parent.mkdir(parents=True, exist_ok=True) + inv.output_dir.mkdir(parents=True, exist_ok=True) log_file = open(inv.log_path, "wb") try: proc = subprocess.Popen( diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py index 39aefcb2..61c5b758 100644 --- a/tests/runs/test_runner.py +++ b/tests/runs/test_runner.py @@ -92,6 +92,27 @@ def test_rf3_partial_chiral_off_flag_present(monkeypatch: pytest.MonkeyPatch) -> assert "--force-all" in inv.argv +def test_build_invocations_records_output_dir(monkeypatch: pytest.MonkeyPatch) -> None: + """`run_grid_search.py` assumes its --output-dir exists; the runner must mkdir it.""" + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("rf3_partial") + inv = runner.build_invocations(preset, results_dir=Path("/r"))[0] + assert inv.output_dir == Path("/r/rf3") + + +def test_dry_run_does_not_create_directories( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """--dry-run prints commands but never touches the filesystem.""" + monkeypatch.setenv("HOME", str(tmp_path)) + results_dir = tmp_path / "results" + preset = loader.load_preset("rf3_partial") + runner.run(preset, results_dir=results_dir, 
dry_run=True) + # results_dir gets created by run() (for log file location) but per-job + # output subdirs must NOT exist after dry-run. + assert not (results_dir / "rf3").exists() + + def _argv_to_dict(tail: list[str]) -> dict[str, object]: """Turn ``[--a, 1, --b, --c, 2]`` into ``{'--a': '1', '--b': True, '--c': '2'}``.""" out: dict[str, object] = {} From 2da1e2dffb9ab646a8189b189aed03b9693222b0 Mon Sep 17 00:00:00 2001 From: xraymemory Date: Thu, 21 May 2026 12:15:01 -0400 Subject: [PATCH 09/28] fix(runs): make ACTL sampleworks image self-contained --- .actlignore | 13 + Dockerfile | 14 +- README.md | 47 ++- run_all_models.sh | 307 ++++++++---------- run_grid_search.py | 78 ++++- src/sampleworks/models/boltz/wrapper.py | 15 +- src/sampleworks/runs/presets/all_models.toml | 2 +- src/sampleworks/runs/runner.py | 147 ++++++++- src/sampleworks/runs/schema.py | 2 +- .../utils/guidance_script_arguments.py | 35 +- .../utils/guidance_script_utils.py | 8 +- tests/runs/conftest.py | 11 + tests/runs/test_loader.py | 9 + tests/runs/test_runner.py | 34 ++ 14 files changed, 517 insertions(+), 205 deletions(-) create mode 100644 .actlignore create mode 100644 tests/runs/conftest.py diff --git a/.actlignore b/.actlignore new file mode 100644 index 00000000..047b4971 --- /dev/null +++ b/.actlignore @@ -0,0 +1,13 @@ +# Keep ACTL sync focused on source. Large data/results should live under +# /mnt/diffuse-shared or the pod home PVC, not in the synced checkout. +.pixi/ +grid_search_results/ +outputs/ +data/ +initial_dataset_40*/ +checkpoints/ +release_data/ +*.ckpt +*.pt +*.tar.gz +*.tgz diff --git a/Dockerfile b/Dockerfile index a51f8651..c7ed94d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,8 +7,8 @@ # Build: # docker build -t sampleworks . 
# -# CI builds pull checkpoints automatically from Docker Hub via: -# COPY --from=diffuseproject/sampleworks-checkpoints:latest +# CI builds pull checkpoints automatically from Harbor via: +# COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest # No checkpoint files are needed in the build context or on the CI runner. # # To rebuild the checkpoints base image (only needed when checkpoints change): @@ -56,7 +56,7 @@ # /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model (~1.4GB) # # Checkpoints base image: -# All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub. +# All checkpoints live in harbor.astera.sh/library/sampleworks-checkpoints:latest. # To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server. # ============================================================================ @@ -108,7 +108,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh # ============================================================================ # Checkpoints (~10 GB) rarely change, so this layer is placed before pixi # installs to stay cached even when dependencies update. 
-COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ +COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/ # ============================================================================ # Install all three environments: boltz, protenix, rf3 @@ -129,6 +129,12 @@ RUN pixi run -e boltz python -c "\ from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)" +COPY run_all_models.sh ./ +RUN chmod +x /app/run_all_models.sh \ + && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \ + && chmod +x /usr/local/bin/run_all_models.sh \ + && printf '\n# ACTL scientist workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n cd /app\nfi\n' >> /root/.bashrc + # Set default checkpoint paths via environment variables ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \ BOLTZ2_CHECKPOINT=/checkpoints/boltz2_conf.ckpt \ diff --git a/README.md b/README.md index bfa49541..dade48e7 100644 --- a/README.md +++ b/README.md @@ -152,35 +152,50 @@ Output layout: `grid_search_results//[_]//ens Instructions for running evaluation and metrics scripts are coming soon. -## Preset experiments (`sampleworks-runs`) +## ACTL preset experiments (`run_all_models.sh` / `sampleworks-runs`) -For canonical multi-model/multi-GPU sweeps, the `sampleworks-runs` CLI orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares its jobs (model, pixi env, GPU assignment, args); the runner launches them in parallel, tees per-job logs, and aggregates exit codes. +For canonical multi-model/multi-GPU sweeps, `sampleworks-runs` orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. 
Each preset declares the model, pixi env, GPU assignment, output subdir, and CLI args. The runner launches jobs in parallel, tees per-job logs, and aggregates exit codes. -**Pod-side prerequisite.** Bundled presets reference the canonical `/data/inputs`, `/data/results`, and `/root/.sampleworks` paths set up by the ACTL pod-init script. On a fresh sampleworks pod, run once per session: +On ACTL, start the pod with the prebuilt image and shared storage, then run one command inside the pod shell: ```bash -bash /mnt/diffuse-shared/raw/sampleworks/actl_setup_sampleworks_paths.sh +actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes + +# inside the ACTL pod shell +# the sampleworks image drops interactive shells in /app +run_all_models.sh --dry-run # inspect commands first +run_all_models.sh # run /app/src/sampleworks/runs/presets/all_models.toml ``` -That creates symlinks pointing the canonical paths at the shared mount (and namespaces `/data/results` by hostname or `$SAMPLEWORKS_ACTL_RUN_NAME`). Overrides via env var (`DATA_DIR=...`) or CLI (`--set defaults.DATA_DIR=...`) work without the symlinks. +The wrapper keeps the TOML preset as the source of truth. 
It only supplies ACTL-friendly defaults: + +- `DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps` +- `RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks/<run-name>/all_models` +- `MSA_CACHE_DIR=/mnt/diffuse-shared/cache/sampleworks/msa` +- `PYTHONPATH=/app/src`, using the copy baked into the sampleworks image +- direct `/app/.pixi/envs/<env>/bin/python` execution, so it reuses the environments baked into the sampleworks image without refreshing pixi caches +- `/tmp` pixi/uv caches for any missing environment preparation, avoiding shared-storage Git cache issues + +Common commands: ```bash -pixi run -e rf3 sampleworks-runs --list # bundled presets -pixi run -e rf3 sampleworks-runs rf3_partial # run a preset -pixi run -e rf3 sampleworks-runs rf3_partial --show # inspect resolved values -pixi run -e rf3 sampleworks-runs rf3_partial --dry-run # print pixi run commands, don't execute -pixi run -e rf3 sampleworks-runs all_models --only rf3,protenix # subset jobs - -# Override any value without editing the TOML: -pixi run -e rf3 sampleworks-runs rf3_partial \ - --set jobs.rf3.gpus=7 \ +run_all_models.sh --list # bundled presets +run_all_models.sh all_models --show # inspect resolved values +run_all_models.sh all_models --only rf3,protenix # subset jobs +run_all_models.sh rf3_partial # run a smaller preset + +# Override paths or parameters without editing TOML: +DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/my_dataset run_all_models.sh rf3_partial +run_all_models.sh rf3_partial \ + --set jobs.rf3.gpus=0 \ + --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02" ``` -Bundled presets live in `src/sampleworks/runs/presets/*.toml`. Add a new preset by dropping a `.toml` file alongside them or pointing at any path: +Bundled presets live in `src/sampleworks/runs/presets/*.toml`.
You can also copy one, edit it, and run it by path: ```bash -sampleworks-runs ./my_experiment.toml +cp src/sampleworks/runs/presets/all_models.toml my_experiment.toml +run_all_models.sh ./my_experiment.toml ``` Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block. diff --git a/run_all_models.sh b/run_all_models.sh index 5b90f81f..96b032b0 100755 --- a/run_all_models.sh +++ b/run_all_models.sh @@ -1,164 +1,145 @@ -#!/bin/bash -# Run all 4 model grid searches in parallel, 2 GPUs each -# Total: 8 GPUs used (4 jobs x 2 GPUs each) +#!/usr/bin/env bash +# ACTL-native entry point for Sampleworks preset runs. # -# Models: -# - Boltz2 X-ray diffraction (GPUs 0,1) -# - Boltz2 MD (GPUs 2,3) -# - RosettaFold3 (GPUs 4,5) -# - Protenix (GPUs 6,7) -# -# Checkpoints are BAKED INTO the Docker image at /checkpoints/. -# If missing, the code auto-falls back to mounted paths. -# -# Usage: -# ./run_all_models.sh - -set -e - -# Configuration -DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}" -MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}" - -# Create directories -mkdir -p "$RESULTS_DIR" -mkdir -p "$MSA_CACHE_DIR" - -# Pull latest image (no-op if already up to date) -echo "Pulling latest Docker image..." 
-docker pull diffuseproject/sampleworks:latest - -# Common docker options -DOCKER_OPTS="--rm --shm-size=16g" - -echo "==========================================" -echo "Starting all model grid searches (4 jobs x 2 GPUs)" -echo "Data: $DATA_DIR" -echo "Results: $RESULTS_DIR" -echo "MSA Cache: $MSA_CACHE_DIR" -echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)" -echo "" -echo "Models:" -echo " - Boltz2 X-ray (GPUs 0,1)" -echo " - Boltz2 MD (GPUs 2,3)" -echo " - RF3 (GPUs 4,5)" -echo " - Protenix (GPUs 6,7)" -echo "==========================================" - -PIDS=() - -# --- Boltz2 X-ray Diffraction (GPUs 0,1) --- -echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1" -docker run $DOCKER_OPTS \ - --gpus '"device=0,1"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \ - -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \ - diffuseproject/sampleworks:latest \ - -e boltz run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --model boltz2 \ - --method "X-RAY DIFFRACTION" \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" & -PIDS+=($!) 
-echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})" - -# --- Boltz2 MD (GPUs 2,3) --- -echo "[$(date)] Starting Boltz2 MD on GPUs 2,3" -docker run $DOCKER_OPTS \ - --gpus '"device=2,3"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \ - -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \ - diffuseproject/sampleworks:latest \ - -e boltz run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --model boltz2 \ - --method "MD" \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" & -PIDS+=($!) -echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})" - -# --- RosettaFold3 (GPUs 4,5) --- -echo "[$(date)] Starting RosettaFold3 on GPUs 4,5" -docker run $DOCKER_OPTS \ - --gpus '"device=4,5"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \ - -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \ - diffuseproject/sampleworks:latest \ - -e rf3 run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --model rf3 \ - --partial-diffusion-step 120 \ - --scalers pure_guidance \ - --ensemble-sizes "8" \ - --gradient-weights "0.01 0.02 0.05" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/rf3_run.log" & -PIDS+=($!) 
-echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})" - -# --- Protenix (GPUs 6,7) --- -echo "[$(date)] Starting Protenix on GPUs 6,7" -docker run $DOCKER_OPTS \ - --gpus '"device=6,7"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - -e SAMPLEWORKS_HOST_INPUT_DIR="$DATA_DIR" \ - -e SAMPLEWORKS_HOST_RESULTS_DIR="$RESULTS_DIR" \ - diffuseproject/sampleworks:latest \ - -e protenix run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --model protenix \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/protenix_run.log" & -PIDS+=($!) -echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})" - -echo "" -echo "==========================================" -echo "All 4 jobs launched! PIDs: ${PIDS[*]}" -echo "Logs:" -echo " - $RESULTS_DIR/boltz2_xrd_run.log" -echo " - $RESULTS_DIR/boltz2_md_run.log" -echo " - $RESULTS_DIR/rf3_run.log" -echo " - $RESULTS_DIR/protenix_run.log" -echo "" -echo "Monitor GPU usage: nvidia-smi -l 1" -echo "Waiting for all jobs to complete..." -echo "==========================================" - -# Wait for all background jobs -wait - -echo "" -echo "==========================================" -echo "[$(date)] All jobs completed!" -echo "==========================================" +# The TOML preset is the source of truth. This wrapper only supplies smooth +# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH, +# and direct use of the prebuilt pixi environments from the image at /app. 
+ +set -euo pipefail + +script_path="${BASH_SOURCE[0]}" +while [[ -L "$script_path" ]]; do + script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)" + script_target="$(readlink "$script_path")" + if [[ "$script_target" == /* ]]; then + script_path="$script_target" + else + script_path="$script_dir/$script_target" + fi +done +script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)" +repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}" + +preset="${SAMPLEWORKS_PRESET:-all_models}" +if [[ $# -gt 0 && "$1" != -* ]]; then + preset="$1" + shift +fi + +if [[ "$preset" == *.toml || "$preset" == */* ]]; then + if [[ "$preset" != /* ]]; then + preset="$repo_root/$preset" + fi +fi +preset_label="${preset##*/}" +preset_label="${preset_label%.toml}" + +run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}" +default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" +default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}" +default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa" + +export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}" +export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}" +export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}" +export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}" +export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}" +export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}" +export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}" + +shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints" +for checkpoint_var_and_file in \ + "BOLTZ1_CHECKPOINT boltz1_conf.ckpt" \ + "BOLTZ2_CHECKPOINT boltz2_conf.ckpt" \ + "RF3_CHECKPOINT rf3_foundry_01_24_latest.ckpt" \ + "PROTENIX_CHECKPOINT protenix_base_default_v0.5.0.pt"; do + read -r checkpoint_var checkpoint_file <<<"$checkpoint_var_and_file" + 
checkpoint_path="$shared_checkpoint_dir/$checkpoint_file" + if [[ -z "${!checkpoint_var:-}" && -f "$checkpoint_path" ]]; then + export "$checkpoint_var=$checkpoint_path" + fi +done + +source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}" +if [[ -f "$source_proteins_csv" ]]; then + # The shared proteins.csv currently contains absolute /data/inputs paths, + # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run + # manifest instead of requiring non-root scientists to create /data symlinks. + manifest_dir="$RESULTS_DIR/_input_manifest" + manifest_proteins_csv="$manifest_dir/proteins.csv" + mkdir -p "$manifest_dir" + legacy_data_dir="/data/inputs" + while IFS= read -r line || [[ -n "$line" ]]; do + printf '%s\n' "${line//$legacy_data_dir/$DATA_DIR}" + done <"$source_proteins_csv" >"$manifest_proteins_csv" + export PROTEINS_CSV="$manifest_proteins_csv" +fi + +runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}" +pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}" +if [[ -z "$pixi_project_dir" ]]; then + if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then + pixi_project_dir="/app" + else + pixi_project_dir="$repo_root" + fi +fi +runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}" + +needs_runtime_paths=1 +for arg in "$@"; do + case "$arg" in + --dry-run|--show|--list|-h|--help) + needs_runtime_paths=0 + ;; + esac +done + +if [[ "$needs_runtime_paths" -eq 1 ]]; then + if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then + cat >&2 < ./run_all_models.sh + +EOF + exit 2 + fi + mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR" +fi + +cat >&2 < dict[str, str]: + """Return process environment values for a direct pixi Python executable. + + Parameters + ---------- + env_python : str + Python executable under ``.pixi/envs//bin/python``. 
+ + Returns + ------- + dict of str to str + Environment with the env's ``bin`` directory, ``CONDA_PREFIX``, and + ``CUDA_HOME`` set so compiled extensions can find tools such as + ``ninja`` and the CUDA toolkit without going through ``pixi run``. + """ + env_dir = Path(env_python).resolve().parent.parent + bin_dir = env_dir / "bin" + env = os.environ.copy() + env["PATH"] = f"{bin_dir}{os.pathsep}{env.get('PATH', '')}" + env["CONDA_PREFIX"] = str(env_dir) + env.setdefault("CUDA_HOME", str(env_dir)) + env["PYTHONNOUSERSITE"] = "1" + return env + + +def get_pixi_env_python(pixi_env: str) -> str | None: + """Return a direct Python binary for a preinstalled pixi environment. + + The ACTL sampleworks image bakes environments under ``/app/.pixi``. Using + those interpreters directly avoids a runtime ``pixi run`` cache refresh on + shared storage. Set ``SAMPLEWORKS_FORCE_PIXI=1`` to force the old behavior. + + Parameters + ---------- + pixi_env : str + Pixi environment name such as ``boltz``, ``protenix``, or ``rf3``. + + Returns + ------- + str or None + Path to the environment's Python executable, or ``None`` to use pixi. + """ + if os.environ.get("SAMPLEWORKS_FORCE_PIXI", "").lower() in {"1", "true", "yes"}: + return None + + env_key = pixi_env.upper().replace("-", "_") + override = os.environ.get(f"SAMPLEWORKS_{env_key}_PYTHON") + if override: + return override + + pixi_project_dir = Path(os.environ.get("SAMPLEWORKS_PIXI_PROJECT_DIR", "/app")) + candidate = pixi_project_dir / ".pixi" / "envs" / pixi_env / "bin" / "python" + if candidate.is_file() and os.access(candidate, os.X_OK): + return str(candidate) + return None + + def main(args: argparse.Namespace): """ Main pipeline for running grid search trials. 
diff --git a/src/sampleworks/models/boltz/wrapper.py b/src/sampleworks/models/boltz/wrapper.py index c257511e..4efbaecc 100644 --- a/src/sampleworks/models/boltz/wrapper.py +++ b/src/sampleworks/models/boltz/wrapper.py @@ -320,7 +320,7 @@ class BoltzConfig: """ out_dir: str | Path | None = None - num_workers: int = 8 + num_workers: int = 0 ensemble_size: int = 1 recycling_steps: int = 3 @@ -329,7 +329,7 @@ def process_structure_for_boltz( structure: dict, *, out_dir: str | Path | None = None, - num_workers: int = 8, + num_workers: int = 0, ensemble_size: int = 1, recycling_steps: int | None = 3, ) -> dict: @@ -360,6 +360,9 @@ def process_structure_for_boltz( if recycling_steps is None: recycling_steps = 3 + # Keep Boltz dataloading in-process by default. Kubernetes pods usually get + # a small /dev/shm, and torch DataLoader workers can exhaust it while + # sharing large featurized batches back to the parent process. config = BoltzConfig( out_dir=out_dir or structure.get("metadata", {}).get("id", "boltz_output"), num_workers=num_workers, @@ -567,7 +570,7 @@ def _setup_data_module( self, input_path: str | Path, out_dir: str | Path, - num_workers: int = 8, + num_workers: int = 0, ): """Create the Lightning data module used by Boltz to serve data to the model. @@ -628,7 +631,7 @@ def _setup_data_module( target_dir=processed.targets_dir, msa_dir=processed.msa_dir, mol_dir=mol_dir, - num_workers=num_workers if num_workers is not None else 8, + num_workers=num_workers if num_workers is not None else 0, constraints_dir=processed.constraints_dir, template_dir=processed_dir / "templates" if (processed_dir / "templates").exists() @@ -1032,7 +1035,7 @@ def _setup_data_module( self, input_path: str | Path, out_dir: str | Path, - num_workers: int = 2, + num_workers: int = 0, ): """Create the Lightning data module used by Boltz to serve data to the model. 
@@ -1090,7 +1093,7 @@ def _setup_data_module( manifest=processed.manifest, target_dir=processed.targets_dir, msa_dir=processed.msa_dir, - num_workers=num_workers if num_workers is not None else 2, + num_workers=num_workers if num_workers is not None else 0, constraints_dir=processed.constraints_dir, ) diff --git a/src/sampleworks/runs/presets/all_models.toml b/src/sampleworks/runs/presets/all_models.toml index 72701997..a00461fb 100644 --- a/src/sampleworks/runs/presets/all_models.toml +++ b/src/sampleworks/runs/presets/all_models.toml @@ -1,7 +1,7 @@ description = "Run all 4 model grid searches in parallel across 8 GPUs (boltz2 X-ray, boltz2 MD, RF3, Protenix)." [defaults] -DATA_DIR = "/data/input" +DATA_DIR = "/data/inputs" RESULTS_DIR = "/data/results/all_models" MSA_CACHE_DIR = "/root/.sampleworks" PROTEINS_CSV = "${DATA_DIR}/proteins.csv" diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py index 6a5c6fbb..72ca9bbf 100644 --- a/src/sampleworks/runs/runner.py +++ b/src/sampleworks/runs/runner.py @@ -15,7 +15,7 @@ from .schema import Job, Preset -GRID_SEARCH_SCRIPT = "/app/run_grid_search.py" +DEFAULT_GRID_SEARCH_SCRIPT = "/app/run_grid_search.py" @dataclass(frozen=True) @@ -68,7 +68,7 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio args = preset.effective_args(job) args.setdefault("output-dir", str(results_dir / job.output_subdir)) argv = _build_argv(job.env, args) - env = {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus} + env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus}) log_path = results_dir / f"{job.name}_run.log" output_dir = Path(args["output-dir"]) invocations.append( @@ -95,7 +95,11 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]: list of str Subprocess argv. 
""" - argv = ["pixi", "run", "-e", pixi_env, "python", GRID_SEARCH_SCRIPT] + env_python = _pixi_env_python(pixi_env) + if env_python: + argv = [env_python, _grid_search_script()] + else: + argv = ["pixi", "run", "-e", pixi_env, "python", _grid_search_script()] for key, value in args.items(): flag = f"--{key}" if isinstance(value, bool): @@ -108,6 +112,101 @@ def _build_argv(pixi_env: str, args: dict[str, Any]) -> list[str]: return argv +def _pixi_env_python(pixi_env: str) -> str | None: + """Return the direct Python binary for a baked pixi environment when available. + + The sampleworks ACTL image already contains fully-installed environments at + ``/app/.pixi/envs/``. Calling those Python binaries directly avoids + ``pixi run`` trying to refresh Git/PyPI caches on shared pod storage. + + Parameters + ---------- + pixi_env : str + Pixi environment name from the preset job. + + Returns + ------- + str or None + Executable Python path, or ``None`` to fall back to ``pixi run``. + """ + if os.environ.get("SAMPLEWORKS_FORCE_PIXI", "").lower() in {"1", "true", "yes"}: + return None + + env_key = pixi_env.upper().replace("-", "_") + override = os.environ.get(f"SAMPLEWORKS_{env_key}_PYTHON") + if override: + return override + + candidate = _pixi_project_dir() / ".pixi" / "envs" / pixi_env / "bin" / "python" + if candidate.is_file() and os.access(candidate, os.X_OK): + return str(candidate) + return None + + +def _job_env(pixi_env: str, env: dict[str, str]) -> dict[str, str]: + """Return an environment equivalent to activating a direct pixi env. + + Parameters + ---------- + pixi_env : str + Pixi environment name used by the job. + env : dict of str to str + Base process environment. + + Returns + ------- + dict of str to str + Environment with the pixi env's ``bin`` directory and compiler/CUDA + paths exposed when the job runs a direct Python binary. 
+ """ + env_python = _pixi_env_python(pixi_env) + if env_python is None: + return env + + env_dir = Path(env_python).resolve().parent.parent + bin_dir = env_dir / "bin" + activated = dict(env) + activated["PATH"] = f"{bin_dir}{os.pathsep}{activated.get('PATH', '')}" + activated["CONDA_PREFIX"] = str(env_dir) + activated.setdefault("CUDA_HOME", str(env_dir)) + activated["PYTHONNOUSERSITE"] = "1" + return activated + + +def _pixi_project_dir() -> Path: + """Return the pixi project directory for env lookup and fallback pixi runs. + + Returns + ------- + Path + Project directory, defaulting to ``/app`` for the sampleworks image or + the current working directory outside that image. + """ + override = os.environ.get("SAMPLEWORKS_PIXI_PROJECT_DIR") + if override: + return Path(override) + app = Path("/app") + if (app / "pyproject.toml").exists(): + return app + return Path.cwd() + + +def _grid_search_script() -> str: + """Return the ``run_grid_search.py`` path used by worker jobs. + + Resolution is intentionally simple for the ACTL sampleworks image: the + baked image keeps a stable copy at :data:`DEFAULT_GRID_SEARCH_SCRIPT`, while + synced PR worktrees can point the runner at their checkout with + ``SAMPLEWORKS_GRID_SEARCH_SCRIPT=/home/dev/workspace/run_grid_search.py``. + + Returns + ------- + str + Path to execute with ``python`` inside each pixi environment. + """ + return os.environ.get("SAMPLEWORKS_GRID_SEARCH_SCRIPT", DEFAULT_GRID_SEARCH_SCRIPT) + + def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: """Launch every job in parallel and wait for completion. 
@@ -137,6 +236,11 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int: _print_dry_run(inv) return 0 + pixi_envs = sorted({inv.job.env for inv in invocations}) + for pixi_env in pixi_envs: + _prepare_pixi_env(pixi_env) + invocations = build_invocations(preset, results_dir=results_dir) + _print_launch_summary(preset, invocations) processes: list[_RunningJob] = [] try: @@ -165,6 +269,43 @@ def _terminate_all(jobs: list[_RunningJob]) -> None: j.tee_thread.join() +def _prepare_pixi_env(pixi_env: str) -> None: + """Prepare a pixi environment before parallel job launch. + + ``pixi run`` is deliberately called once per env even when the interpreter + directory already exists, because pixi may still need to materialize PyPI + packages into that environment after image startup. + + Parameters + ---------- + pixi_env : str + Pixi environment to prepare. + + Raises + ------ + subprocess.CalledProcessError + If pixi cannot prepare the environment. + """ + if os.environ.get("SAMPLEWORKS_SKIP_ENV_PREPARE", "").lower() in { + "1", + "true", + "yes", + }: + return + + env = { + **os.environ, + "PIXI_CACHE_DIR": os.environ.get("PIXI_CACHE_DIR", "/tmp/pixi-cache"), + "UV_CACHE_DIR": os.environ.get("UV_CACHE_DIR", "/tmp/uv-cache"), + } + cmd = ["pixi", "run", "-e", pixi_env, "python", "-c", "print('ready')"] + print( + f"[{_ts()}] preparing pixi env {pixi_env!r} with {shlex.join(cmd)}", + file=sys.stderr, + ) + subprocess.run(cmd, cwd=str(_pixi_project_dir()), env=env, check=True) + + def _print_dry_run(inv: JobInvocation) -> None: """Print the exact command for one job without launching it. diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py index 6e1b768a..b57a84e9 100644 --- a/src/sampleworks/runs/schema.py +++ b/src/sampleworks/runs/schema.py @@ -1,7 +1,7 @@ """Dataclasses for the preset schema. A preset describes one or more parallel ``run_grid_search.py`` jobs. 
Each job -is launched as ``pixi run -e python /app/run_grid_search.py `` with +is launched as ``pixi run -e python `` with ``CUDA_VISIBLE_DEVICES`` set to the job's GPU assignment. """ diff --git a/src/sampleworks/utils/guidance_script_arguments.py b/src/sampleworks/utils/guidance_script_arguments.py index 9c2601fa..2876c06e 100644 --- a/src/sampleworks/utils/guidance_script_arguments.py +++ b/src/sampleworks/utils/guidance_script_arguments.py @@ -9,16 +9,34 @@ from sampleworks.utils.guidance_constants import GuidanceType, StructurePredictor -# Baked-in checkpoint paths (Docker image) with legacy fallbacks +# Baked-in checkpoint paths (Docker image), ACTL shared-storage paths, and +# legacy fallbacks. Environment variables win when present. +_CHECKPOINT_ENV_VARS = { + "boltz1": "BOLTZ1_CHECKPOINT", + "boltz2": "BOLTZ2_CHECKPOINT", + "rf3": "RF3_CHECKPOINT", + "protenix": "PROTENIX_CHECKPOINT", +} + _CHECKPOINT_CANDIDATES = { - "boltz1": ["/checkpoints/boltz1_conf.ckpt", "~/.boltz/boltz1_conf.ckpt"], - "boltz2": ["/checkpoints/boltz2_conf.ckpt", "~/.boltz/boltz2_conf.ckpt"], + "boltz1": [ + "/checkpoints/boltz1_conf.ckpt", + "/mnt/diffuse-shared/raw/checkpoints/boltz1_conf.ckpt", + "~/.boltz/boltz1_conf.ckpt", + ], + "boltz2": [ + "/checkpoints/boltz2_conf.ckpt", + "/mnt/diffuse-shared/raw/checkpoints/boltz2_conf.ckpt", + "~/.boltz/boltz2_conf.ckpt", + ], "rf3": [ "/checkpoints/rf3_foundry_01_24_latest.ckpt", + "/mnt/diffuse-shared/raw/checkpoints/rf3_foundry_01_24_latest.ckpt", "~/.foundry/checkpoints/rf3_foundry_01_24_latest.ckpt", ], "protenix": [ "/checkpoints/protenix_base_default_v0.5.0.pt", + "/mnt/diffuse-shared/raw/checkpoints/protenix_base_default_v0.5.0.pt", ".pixi/envs/protenix-dev/lib/python3.12/site-packages/release_data/checkpoint/protenix_base_default_v0.5.0.pt", ], } @@ -31,7 +49,11 @@ def _resolve_checkpoint(model_key: str) -> str: legacy development paths. 
If none are found the first candidate is returned so that downstream validation produces a clear error message. """ - candidates = _CHECKPOINT_CANDIDATES.get(model_key, []) + env_var = _CHECKPOINT_ENV_VARS.get(model_key) + candidates = [] + if env_var and os.environ.get(env_var): + candidates.append(os.environ[env_var]) + candidates.extend(_CHECKPOINT_CANDIDATES.get(model_key, [])) for candidate in candidates: resolved = Path(candidate).expanduser() if resolved.exists(): @@ -45,9 +67,10 @@ def _resolve_checkpoint(model_key: str) -> str: f"Provide --model-checkpoint or bake checkpoints into /checkpoints/." ) if not Path(resolved).exists(): + env_hint = _CHECKPOINT_ENV_VARS.get(model_key, "a checkpoint env var") raise ValueError( - f"Model checkpoint '{resolved}' does not exist. " - f"Provide a valid path via --model-checkpoint." + f"Model checkpoint for '{model_key}' was not found. Checked: {candidates}. " + f"Provide --model-checkpoint or set {env_hint}." ) return resolved diff --git a/src/sampleworks/utils/guidance_script_utils.py b/src/sampleworks/utils/guidance_script_utils.py index a72492ee..5d26a7c6 100644 --- a/src/sampleworks/utils/guidance_script_utils.py +++ b/src/sampleworks/utils/guidance_script_utils.py @@ -459,8 +459,14 @@ def _run_guidance(args: GuidanceConfig, guidance_type: str, model_wrapper, devic elif "Boltz" in wrapper_class_name: from sampleworks.models.boltz.wrapper import process_structure_for_boltz + # Boltz preprocessing writes manifest/NPZ/MSA files as a side effect. + # Keep those under the per-job output directory so concurrent grid jobs + # for the same protein do not race on a shared metadata-derived path. 
structure = process_structure_for_boltz( - structure, ensemble_size=args.ensemble_size, recycling_steps=recycling_steps + structure, + out_dir=args.output_dir, + ensemble_size=args.ensemble_size, + recycling_steps=recycling_steps, ) else: raise ValueError(f"Unknown model wrapper class: {wrapper_class_name}") diff --git a/tests/runs/conftest.py b/tests/runs/conftest.py new file mode 100644 index 00000000..a20482c0 --- /dev/null +++ b/tests/runs/conftest.py @@ -0,0 +1,11 @@ +"""Shared test fixtures for preset-runner tests.""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture(autouse=True) +def force_pixi_argv(monkeypatch: pytest.MonkeyPatch) -> None: + """Keep argv assertions deterministic on machines with /app/.pixi present.""" + monkeypatch.setenv("SAMPLEWORKS_FORCE_PIXI", "1") diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py index 6528a602..7a2bb1b1 100644 --- a/tests/runs/test_loader.py +++ b/tests/runs/test_loader.py @@ -43,6 +43,15 @@ def test_defaults_used_when_env_unset(monkeypatch: pytest.MonkeyPatch) -> None: assert preset.defaults["DATA_DIR"] == "/data/inputs" +def test_all_models_uses_canonical_inputs_dir(monkeypatch: pytest.MonkeyPatch) -> None: + """The flagship preset must use /data/inputs, matching the ACTL wrapper.""" + monkeypatch.delenv("DATA_DIR", raising=False) + monkeypatch.setenv("HOME", "/home/test") + preset = loader.load_preset("all_models") + assert preset.defaults["DATA_DIR"] == "/data/inputs" + assert preset.shared_args["proteins"] == "/data/inputs/proteins.csv" + + def test_set_override_at_defaults(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("DATA_DIR", raising=False) monkeypatch.setenv("HOME", "/home/test") diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py index 61c5b758..040ea0c1 100644 --- a/tests/runs/test_runner.py +++ b/tests/runs/test_runner.py @@ -100,6 +100,40 @@ def test_build_invocations_records_output_dir(monkeypatch: pytest.MonkeyPatch) - assert 
inv.output_dir == Path("/r/rf3") +def test_grid_search_script_can_be_overridden(monkeypatch: pytest.MonkeyPatch) -> None: + """ACTL wrappers can run the synced checkout instead of the baked /app copy.""" + monkeypatch.setenv("HOME", "/home/test") + monkeypatch.setenv("SAMPLEWORKS_GRID_SEARCH_SCRIPT", "/home/dev/workspace/run_grid_search.py") + preset = loader.load_preset("rf3_partial") + inv = runner.build_invocations(preset, results_dir=Path("/r"))[0] + assert inv.argv[:6] == [ + "pixi", + "run", + "-e", + "rf3", + "python", + "/home/dev/workspace/run_grid_search.py", + ] + + +def test_uses_baked_env_python_when_available( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """ACTL image runs bypass pixi cache refreshes by calling env Python directly.""" + monkeypatch.delenv("SAMPLEWORKS_FORCE_PIXI", raising=False) + monkeypatch.setenv("HOME", "/home/test") + pixi_project = tmp_path / "app" + python_bin = pixi_project / ".pixi" / "envs" / "rf3" / "bin" / "python" + python_bin.parent.mkdir(parents=True) + python_bin.write_text("#!/bin/sh\n") + python_bin.chmod(0o755) + monkeypatch.setenv("SAMPLEWORKS_PIXI_PROJECT_DIR", str(pixi_project)) + + preset = loader.load_preset("rf3_partial") + inv = runner.build_invocations(preset, results_dir=Path("/r"))[0] + assert inv.argv[:2] == [str(python_bin), "/app/run_grid_search.py"] + + def test_dry_run_does_not_create_directories( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 6869dcbde735cc3313fc98abba0ab62921fe1c8c Mon Sep 17 00:00:00 2001 From: xraymemory Date: Thu, 21 May 2026 15:28:17 -0400 Subject: [PATCH 10/28] fix(runs): align experiment entrypoint and boltz tests --- Dockerfile | 8 +- GRID_SEARCH.md | 2 +- README.md | 44 ++----- run_all_models.sh | 132 +-------------------- run_experiments | 145 +++++++++++++++++++++++ run_experiments.sh | 17 +++ tests/models/boltz/test_boltz_wrapper.py | 4 +- 7 files changed, 183 insertions(+), 169 deletions(-) create mode 100755 run_experiments create 
mode 100755 run_experiments.sh diff --git a/Dockerfile b/Dockerfile index c7ed94d7..f6d8f495 100644 --- a/Dockerfile +++ b/Dockerfile @@ -129,10 +129,12 @@ RUN pixi run -e boltz python -c "\ from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)" -COPY run_all_models.sh ./ -RUN chmod +x /app/run_all_models.sh \ +COPY run_experiments run_experiments.sh run_all_models.sh ./ +RUN chmod +x /app/run_experiments /app/run_experiments.sh /app/run_all_models.sh \ + && printf '#!/usr/bin/env bash\nexec /app/run_experiments "$@"\n' > /usr/local/bin/run_experiments \ + && printf '#!/usr/bin/env bash\nexec /app/run_experiments.sh "$@"\n' > /usr/local/bin/run_experiments.sh \ && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \ - && chmod +x /usr/local/bin/run_all_models.sh \ + && chmod +x /usr/local/bin/run_experiments /usr/local/bin/run_experiments.sh /usr/local/bin/run_all_models.sh \ && printf '\n# ACTL scientist workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n cd /app\nfi\n' >> /root/.bashrc # Set default checkpoint paths via environment variables diff --git a/GRID_SEARCH.md b/GRID_SEARCH.md index 154d00c5..4ae15630 100644 --- a/GRID_SEARCH.md +++ b/GRID_SEARCH.md @@ -6,7 +6,7 @@ and how to find and read logs if you need to debug the process. ## Optional: Setting up the docker container It is often useful to have a docker container with all the dependencies installed. -Our script `run_all_models.sh` for instance uses a docker container to manage all +Our script `run_experiments` for instance uses a docker container to manage all dependencies. To run that script, you will need to have docker installed. 
Build the container with ```shell diff --git a/README.md b/README.md index dade48e7..73c4783b 100644 --- a/README.md +++ b/README.md @@ -152,53 +152,31 @@ Output layout: `grid_search_results//[_]//ens Instructions for running evaluation and metrics scripts are coming soon. -## ACTL preset experiments (`run_all_models.sh` / `sampleworks-runs`) +## ACTL preset experiments (`run_experiments`) -For canonical multi-model/multi-GPU sweeps, `sampleworks-runs` orchestrates parallel `run_grid_search.py` jobs from a single TOML preset. Each preset declares the model, pixi env, GPU assignment, output subdir, and CLI args. The runner launches jobs in parallel, tees per-job logs, and aggregates exit codes. - -On ACTL, start the pod with the prebuilt image and shared storage, then run one command inside the pod shell: +Use ACTL to get a ready-to-run Sampleworks pod with 8 GPUs and the shared data PVC: ```bash actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes - -# inside the ACTL pod shell -# the sampleworks image drops interactive shells in /app -run_all_models.sh --dry-run # inspect commands first -run_all_models.sh # run /app/src/sampleworks/runs/presets/all_models.toml ``` -The wrapper keeps the TOML preset as the source of truth. 
It only supplies ACTL-friendly defaults: - -- `DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps` -- `RESULTS_DIR=/mnt/diffuse-shared/results/sampleworks/<run-name>/all_models` -- `MSA_CACHE_DIR=/mnt/diffuse-shared/cache/sampleworks/msa` -- `PYTHONPATH=/app/src`, using the copy baked into the sampleworks image -- direct `/app/.pixi/envs/<env>/bin/python` execution, so it reuses the environments baked into the sampleworks image without refreshing pixi caches -- `/tmp` pixi/uv caches for any missing environment preparation, avoiding shared-storage Git cache issues - -Common commands: +Inside the pod shell (`/app`), run: ```bash -run_all_models.sh --list # bundled presets -run_all_models.sh all_models --show # inspect resolved values -run_all_models.sh all_models --only rf3,protenix # subset jobs -run_all_models.sh rf3_partial # run a smaller preset - -# Override paths or parameters without editing TOML: -DATA_DIR=/mnt/diffuse-shared/raw/sampleworks/my_dataset run_all_models.sh rf3_partial -run_all_models.sh rf3_partial \ - --set jobs.rf3.gpus=0 \ - --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02" +run_experiments --dry-run +run_experiments all_models ``` -Bundled presets live in `src/sampleworks/runs/presets/*.toml`. You can also copy one, edit it, and run it by path: +`run_experiments` is a thin wrapper around `sampleworks-runs`: it reads TOML presets and launches the requested `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job. The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2 MD, RF3, and Protenix. + +Presets live in `/app/src/sampleworks/runs/presets/*.toml` (same path in the repo: `src/sampleworks/runs/presets/`). 
To change an experiment, either edit/copy a preset or override values at launch: ```bash -cp src/sampleworks/runs/presets/all_models.toml my_experiment.toml -run_all_models.sh ./my_experiment.toml +run_experiments all_models --only rf3,protenix +run_experiments rf3_partial --set jobs.rf3.gpus=0 ``` -Env-var defaults (`DATA_DIR`, `RESULTS_DIR`, `MSA_CACHE_DIR`, `PROTEINS_CSV`) declared per preset are filled from the process environment when set, otherwise from the preset's `[defaults]` block. +The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<run-name>/<preset>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations. `run_all_models.sh` remains as a compatibility alias. ## Docker diff --git a/run_all_models.sh b/run_all_models.sh index 96b032b0..f1f81b8c 100755 --- a/run_all_models.sh +++ b/run_all_models.sh @@ -1,10 +1,5 @@ #!/usr/bin/env bash -# ACTL-native entry point for Sampleworks preset runs. -# -# The TOML preset is the source of truth. This wrapper only supplies smooth -# pod defaults: persistent /mnt paths, the synced PR source tree on PYTHONPATH, -# and direct use of the prebuilt pixi environments from the image at /app. - +# Backward-compatible alias. Prefer run_experiments for new docs/usage. 
set -euo pipefail script_path="${BASH_SOURCE[0]}" @@ -18,128 +13,5 @@ while [[ -L "$script_path" ]]; do fi done script_dir="$(cd -- "$(dirname -- "$script_path")" && pwd)" -repo_root="${SAMPLEWORKS_APP_DIR:-$script_dir}" - -preset="${SAMPLEWORKS_PRESET:-all_models}" -if [[ $# -gt 0 && "$1" != -* ]]; then - preset="$1" - shift -fi - -if [[ "$preset" == *.toml || "$preset" == */* ]]; then - if [[ "$preset" != /* ]]; then - preset="$repo_root/$preset" - fi -fi -preset_label="${preset##*/}" -preset_label="${preset_label%.toml}" - -run_name="${SAMPLEWORKS_ACTL_RUN_NAME:-$(hostname -s 2>/dev/null || printf 'sampleworks')}" -default_data_dir="/mnt/diffuse-shared/raw/sampleworks/initial_dataset_40_occ_sweeps" -default_results_dir="/mnt/diffuse-shared/results/sampleworks/${run_name}/${preset_label}" -default_msa_cache_dir="/mnt/diffuse-shared/cache/sampleworks/msa" - -export DATA_DIR="${DATA_DIR:-${SAMPLEWORKS_DATA_DIR:-$default_data_dir}}" -export RESULTS_DIR="${RESULTS_DIR:-${SAMPLEWORKS_RESULTS_DIR:-$default_results_dir}}" -export MSA_CACHE_DIR="${MSA_CACHE_DIR:-${SAMPLEWORKS_MSA_CACHE_DIR:-$default_msa_cache_dir}}" -export SAMPLEWORKS_GRID_SEARCH_SCRIPT="${SAMPLEWORKS_GRID_SEARCH_SCRIPT:-$repo_root/run_grid_search.py}" -export PYTHONPATH="$repo_root/src${PYTHONPATH:+:$PYTHONPATH}" -export PIXI_CACHE_DIR="${PIXI_CACHE_DIR:-/tmp/pixi-cache}" -export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv-cache}" - -shared_checkpoint_dir="/mnt/diffuse-shared/raw/checkpoints" -for checkpoint_var_and_file in \ - "BOLTZ1_CHECKPOINT boltz1_conf.ckpt" \ - "BOLTZ2_CHECKPOINT boltz2_conf.ckpt" \ - "RF3_CHECKPOINT rf3_foundry_01_24_latest.ckpt" \ - "PROTENIX_CHECKPOINT protenix_base_default_v0.5.0.pt"; do - read -r checkpoint_var checkpoint_file <<<"$checkpoint_var_and_file" - checkpoint_path="$shared_checkpoint_dir/$checkpoint_file" - if [[ -z "${!checkpoint_var:-}" && -f "$checkpoint_path" ]]; then - export "$checkpoint_var=$checkpoint_path" - fi -done - 
-source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}" -if [[ -f "$source_proteins_csv" ]]; then - # The shared proteins.csv currently contains absolute /data/inputs paths, - # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run - # manifest instead of requiring non-root scientists to create /data symlinks. - manifest_dir="$RESULTS_DIR/_input_manifest" - manifest_proteins_csv="$manifest_dir/proteins.csv" - mkdir -p "$manifest_dir" - legacy_data_dir="/data/inputs" - while IFS= read -r line || [[ -n "$line" ]]; do - printf '%s\n' "${line//$legacy_data_dir/$DATA_DIR}" - done <"$source_proteins_csv" >"$manifest_proteins_csv" - export PROTEINS_CSV="$manifest_proteins_csv" -fi - -runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}" -pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}" -if [[ -z "$pixi_project_dir" ]]; then - if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then - pixi_project_dir="/app" - else - pixi_project_dir="$repo_root" - fi -fi -runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}" - -needs_runtime_paths=1 -for arg in "$@"; do - case "$arg" in - --dry-run|--show|--list|-h|--help) - needs_runtime_paths=0 - ;; - esac -done - -if [[ "$needs_runtime_paths" -eq 1 ]]; then - if [[ ! 
-f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then - cat >&2 < ./run_all_models.sh - -EOF - exit 2 - fi - mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR" -fi - -cat >&2 <"$manifest_proteins_csv" + export PROTEINS_CSV="$manifest_proteins_csv" +fi + +runner_env="${SAMPLEWORKS_RUNNER_ENV:-rf3}" +pixi_project_dir="${SAMPLEWORKS_PIXI_PROJECT_DIR:-}" +if [[ -z "$pixi_project_dir" ]]; then + if [[ -f /app/pyproject.toml && -d /app/.pixi ]]; then + pixi_project_dir="/app" + else + pixi_project_dir="$repo_root" + fi +fi +runner_python="${SAMPLEWORKS_RUNNER_PYTHON:-$pixi_project_dir/.pixi/envs/$runner_env/bin/python}" + +needs_runtime_paths=1 +for arg in "$@"; do + case "$arg" in + --dry-run|--show|--list|-h|--help) + needs_runtime_paths=0 + ;; + esac +done + +if [[ "$needs_runtime_paths" -eq 1 ]]; then + if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then + cat >&2 < ./run_experiments + +EOF + exit 2 + fi + mkdir -p "$RESULTS_DIR" "$MSA_CACHE_DIR" +fi + +cat >&2 < Date: Fri, 22 May 2026 11:11:38 -0400 Subject: [PATCH 11/28] fix(runs): prefer synced workspace source --- Dockerfile | 27 +++--- GRID_SEARCH.md | 4 +- README.md | 13 ++- docker-entrypoint.sh | 50 ++++++------ run_experiments | 59 ++++++++++++-- run_grid_search.py | 23 +++++- src/sampleworks/runs/runner.py | 145 +++++++++++++++++++++++++++++++-- tests/runs/test_runner.py | 37 +++++++++ 8 files changed, 303 insertions(+), 55 deletions(-) diff --git a/Dockerfile b/Dockerfile index f6d8f495..04e29ad9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ # Checkpoints are baked into the image at /checkpoints/ via a pre-built base image. # # Build: -# docker build -t sampleworks . +# docker build -t pixi-with-checkpoints . 
# # CI builds pull checkpoints automatically from Harbor via: # COPY --from=harbor.astera.sh/library/sampleworks-checkpoints:latest @@ -16,10 +16,10 @@ # # Run examples: # # Show help -# docker run sampleworks --help +# docker run pixi-with-checkpoints --help # # # Run grid search with Boltz1 (checkpoint baked in) -# docker run --gpus all -v /data:/data sampleworks \ +# docker run --gpus all -v /data:/data pixi-with-checkpoints \ # -e boltz run_grid_search.py \ # --proteins /data/proteins.csv \ # --models boltz1 \ @@ -33,7 +33,7 @@ # --align-to-input # # # Run grid search with Boltz2 (checkpoint baked in) -# docker run --gpus all -v /data:/data sampleworks \ +# docker run --gpus all -v /data:/data pixi-with-checkpoints \ # -e boltz run_grid_search.py \ # --proteins /data/proteins.csv \ # --models boltz2 \ @@ -45,7 +45,7 @@ # --use-tweedie # # # Interactive shell -# docker run --gpus all -it sampleworks bash +# docker run --gpus all -it pixi-with-checkpoints bash # # Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest): # /checkpoints/boltz1_conf.ckpt - Boltz1 model (~3.5GB) @@ -129,13 +129,16 @@ RUN pixi run -e boltz python -c "\ from sampleworks.core.forward_models.xray.real_space_density_deps.ops import dilate_atom_centric; \ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-compilation skipped (no GPU during build)" -COPY run_experiments run_experiments.sh run_all_models.sh ./ -RUN chmod +x /app/run_experiments /app/run_experiments.sh /app/run_all_models.sh \ - && printf '#!/usr/bin/env bash\nexec /app/run_experiments "$@"\n' > /usr/local/bin/run_experiments \ - && printf '#!/usr/bin/env bash\nexec /app/run_experiments.sh "$@"\n' > /usr/local/bin/run_experiments.sh \ - && printf '#!/usr/bin/env bash\nexec /app/run_all_models.sh "$@"\n' > /usr/local/bin/run_all_models.sh \ - && chmod +x /usr/local/bin/run_experiments /usr/local/bin/run_experiments.sh /usr/local/bin/run_all_models.sh \ - && printf '\n# ACTL scientist 
workflow: land in the baked Sampleworks app.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /app ]; then\n cd /app\nfi\n' >> /root/.bashrc +# This image carries pixi environments and checkpoints. Runtime source should +# come from ACTL's synced checkout at /home/dev/workspace, not from stale code +# baked into /app during image construction. +RUN rm -rf /app/src /app/scripts /app/run_grid_search.py \ + && mkdir -p /home/dev/workspace + +COPY --chmod=755 run_experiments run_experiments.sh run_all_models.sh /usr/local/bin/ +RUN printf '\n# ACTL scientist workflow: land in the synced Sampleworks checkout.\nif [[ $- == *i* ]] && [ -z "${SAMPLEWORKS_NO_AUTO_CD:-}" ] && [ -d /home/dev/workspace ]; then\n cd /home/dev/workspace\nfi\n' >> /root/.bashrc + +ENV SAMPLEWORKS_PIXI_PROJECT_DIR=/app # Set default checkpoint paths via environment variables ENV BOLTZ1_CHECKPOINT=/checkpoints/boltz1_conf.ckpt \ diff --git a/GRID_SEARCH.md b/GRID_SEARCH.md index 4ae15630..b8f9aa73 100644 --- a/GRID_SEARCH.md +++ b/GRID_SEARCH.md @@ -10,10 +10,10 @@ Our script `run_experiments` for instance uses a docker container to manage all dependencies. To run that script, you will need to have docker installed. Build the container with ```shell -docker build -t diffuseproject/sampleworks . +docker build -t pixi-with-checkpoints . ``` which will add an image to your local docker repository called -`diffuseproject/sampleworks:latest`. The top of the `Dockerfile` contains +`pixi-with-checkpoints:latest`. The top of the `Dockerfile` contains instructions on how to use the container as well. The container entrypoint (`docker-entrypoint`) is fairly generic and is used to call the `run_grid_search.py` script described below. diff --git a/README.md b/README.md index 73c4783b..07a95646 100644 --- a/README.md +++ b/README.md @@ -154,13 +154,15 @@ Instructions for running evaluation and metrics scripts are coming soon. 
## ACTL preset experiments (`run_experiments`) -Use ACTL to get a ready-to-run Sampleworks pod with 8 GPUs and the shared data PVC: +Use ACTL to get a ready-to-run pod with baked pixi environments, checkpoints, +and the shared data PVC: ```bash -actl pod up sampleworks-pr236 --profile 8x --image sampleworks --storage shared --pvc-size 200Gi --mount diffuse-shared --yes +actl pod up sampleworks-pr236 --profile 8x --image harbor.astera.sh/library/pixi-with-checkpoints:cuda12.4-2026-05-21-pr240-workspace1 --storage shared --pvc-size 200Gi --mount diffuse-shared --yes ``` -Inside the pod shell (`/app`), run: +ACTL syncs your local checkout to `/home/dev/workspace`; interactive shells land +there. Run experiments from that synced checkout, not from `/app`: ```bash run_experiments --dry-run @@ -169,13 +171,16 @@ run_experiments all_models `run_experiments` is a thin wrapper around `sampleworks-runs`: it reads TOML presets and launches the requested `run_grid_search.py` jobs in parallel, with `CUDA_VISIBLE_DEVICES` set per job. The default preset is `all_models`, which splits GPUs across Boltz2 XRD, Boltz2 MD, RF3, and Protenix. -Presets live in `/app/src/sampleworks/runs/presets/*.toml` (same path in the repo: `src/sampleworks/runs/presets/`). To change an experiment, either edit/copy a preset or override values at launch: +Presets live in the synced repo at `src/sampleworks/runs/presets/*.toml`. To change an experiment, either edit/copy a preset locally and let ACTL sync it, or override values at launch: ```bash run_experiments all_models --only rf3,protenix run_experiments rf3_partial --set jobs.rf3.gpus=0 ``` +On smaller pods, make sure preset GPU IDs only reference visible pod GPUs +(`0..N-1`). `run_experiments` fails fast if a preset requests unavailable GPUs. 
+ The shared inputs are under `/mnt/diffuse-shared/raw/sampleworks/...`; checkpoints are in `/mnt/diffuse-shared/raw/checkpoints`; default results go to `/mnt/diffuse-shared/results/sampleworks/<run-name>/<preset>/`; MSA caches go to `/mnt/diffuse-shared/cache/sampleworks/msa`. Set `DATA_DIR`, `RESULTS_DIR`, or `MSA_CACHE_DIR` before running to change these locations. `run_all_models.sh` remains as a compatibility alias. diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 957c351d..0477a7dc 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -2,15 +2,15 @@ # Sampleworks Docker Entrypoint # # Usage: -# docker run sampleworks -e