From 2bae0fe47d1d59249c970555a6384f7a4b86163a Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Tue, 9 Jun 2026 11:16:24 -0500 Subject: [PATCH] README routing for personal-endpoint case + graceful remote timeout handling Docs: - README router now distinguishes "your own personal endpoint" (case 3) from "shared/group endpoint operator" (case 4). The first was previously folded into the sysadmin path and underserved. - docs/operating-an-endpoint.md opens with a 30-minute solo personal quickstart (six commands plus one config edit) before the full 8-step shared setup, with clear pointers to when each piece of hardening becomes mandatory. - AGENTS.md adds an "upfront remote expectation setting" rule so the agent surfaces target endpoint, queue characteristics, and timeout to the user before invoking any use_remote=True tool. Code: - remote/health.py: replace `with Executor(...) as ex:` with explicit try/finally + ex.shutdown(wait=False). The context manager drained pending futures on exit, which caused probes against a slow or wedged endpoint to hang for the full timeout (and sometimes much longer) instead of returning. - tools/remote_tools.py: wrap the remote call in try/except so a TimeoutError becomes an actionable message ("job likely waiting in Slurm/PBS queue, raise timeout_seconds or wait") instead of a bare traceback. Lock: - uv.lock catches up to the 0.1.1 package version already in pyproject.toml. --- AGENTS.md | 3 + README.md | 13 +++- docs/operating-an-endpoint.md | 102 ++++++++++++++++++++++++-- src/uxarray_mcp/remote/health.py | 7 +- src/uxarray_mcp/tools/remote_tools.py | 17 ++++- uv.lock | 2 +- 6 files changed, 131 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index be18f65..0f4cbb9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,6 +60,9 @@ Documentation: `docs/` (Sphinx, built to ReadTheDocs). and check for empty PNG bytes after rendering. Raise `ValueError` with a clear message pointing to the likely cause. 
+- **Upfront remote expectation setting** — before invoking any tool with `use_remote=True`, the AI agent must state, at the start of its text response, the target HPC endpoint, the potential queue wait time, and the active timeout configuration. This ensures transparency when jobs are queued on batch schedulers (Slurm/PBS). + + ## Repository layout ``` diff --git a/README.md b/README.md index b4efe7e..d75b9d3 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,15 @@ You are most likely one of: 2. **HPC user, endpoint already exists** — someone at your lab gave you a Globus Compute endpoint UUID. → [Local install](#local-install), then [docs/remote-hpc.md](docs/remote-hpc.md) (15 min). -3. **HPC user, no endpoint yet** — you have shell access to ANL, NCAR, NERSC, - etc., and need to stand one up. → [Local install](#local-install), - then [docs/operating-an-endpoint.md](docs/operating-an-endpoint.md) (~1 hr, - site-dependent). +3. **HPC user, your own personal endpoint** — you have a Globus identity and + shell access to an HPC machine, and want to stand up an endpoint just for + yourself. → [Local install](#local-install), then + [docs/operating-an-endpoint.md](docs/operating-an-endpoint.md#solo-personal-endpoint-quickstart) + (~30 min). +4. **Group / shared endpoint operator** — you're standing one up for a team, + project, or lab. → [Local install](#local-install), then the full + [docs/operating-an-endpoint.md](docs/operating-an-endpoint.md) including + service-account migration and the MEP allowlist (~1 hr+, site-dependent). --- diff --git a/docs/operating-an-endpoint.md b/docs/operating-an-endpoint.md index e398bc1..80c9eca 100644 --- a/docs/operating-an-endpoint.md +++ b/docs/operating-an-endpoint.md @@ -1,9 +1,17 @@ # Operating an Endpoint -This page is for the person who stands up the Globus Compute endpoint on -the HPC machine. That might be you (for personal use) or a sysadmin / PI -(for a group). 
**Read [SECURITY.md](https://github.com/UXARRAY/uxarray-mcp-server/blob/main/SECURITY.md) first** — operating an -endpoint is shell-equivalent delegation, not a casual config change. +This page covers two related cases: + +- **You want a personal endpoint** that only you submit to. Skip to + [Solo personal endpoint quickstart](#solo-personal-endpoint-quickstart) below + (~30 min). The full 8-step section is overkill for this case. +- **You're standing up a shared endpoint** for a team, project, or lab. Use the + full [eight-step setup](#the-eight-steps) including a service account and the + function allowlist (~1 hr+). + +**Read [SECURITY.md](https://github.com/UXARRAY/uxarray-mcp-server/blob/main/SECURITY.md) first** — even a personal endpoint is shell-equivalent access for +whoever can submit to it (which includes anyone holding a refresh token stolen from your laptop), +so the hardening basics still apply. > **Prerequisites:** > 1. Shell access to the HPC machine. @@ -37,6 +45,90 @@ the endpoint's Globus Auth allow-list. Do this with eyes open. --- +## Solo personal endpoint quickstart + +If only **you** will submit to this endpoint, you can skip the service-account +ticket, the multi-user setup, and the function allowlist. The minimum viable +personal endpoint is six commands and one config edit. + +**Prereqs:** shell on the HPC machine, a Globus identity, your project's +Slurm account or PBS project ID, and the site's conda/module convention. + +```bash +# 1. On the HPC machine, in your account +module load conda # or whatever your site provides +conda create -n gce python=3.11 -c conda-forge -y +conda activate gce +pip install globus-compute-endpoint uxarray xarray netCDF4 h5netcdf + +# 2. Configure the endpoint +globus-compute-endpoint configure uxarray +``` + +Edit `~/.globus_compute/uxarray/config.yaml` and set the scheduler block. 
+Minimum diff from the generated template (PBS example shown — see Step 3 +below for Slurm): + +```yaml +display_name: uxarray +engine: + type: GlobusComputeEngine + provider: + type: PBSProProvider + queue: casper # or your site's queue + account: YOUR_PROJECT_ID # critical — without this, jobs are rejected + nodes_per_block: 1 + init_blocks: 1 + min_blocks: 0 + max_blocks: 1 + walltime: "01:00:00" + worker_init: | + unset PYTHONPATH # critical — see Step 3 for why + module load conda + conda activate gce +``` + +Lock down the auth so only **your** Globus identity can submit (find your +identity UUID at https://app.globus.org/account/identities): + +```yaml +authentication_policy: + high_assurance: true # recent MFA required + allowed_identities: + - your-globus-identity-uuid +``` + +Then: + +```bash +# 3. Start the endpoint (opens a browser tab for OAuth on first run) +globus-compute-endpoint start uxarray +# → registers and prints your endpoint UUID. Save it. + +# 4. Lock down credential storage +chmod 700 ~/.globus_compute ~/.globus +``` + +**Test from your laptop:** + +```bash +uxarray-mcp endpoints add mine --path-prefix /glade/ # or /lcrc/, /gpfs/... +uxarray-mcp doctor --endpoint mine +``` + +If `doctor` reports `active`, you're done. Total time: ~30 min the first time. + +**You should still do the full hardening eventually:** + +- If anyone else will ever submit (collaborator, student, agent on a shared + laptop) → migrate to a service account ([Step 1](#step-1--pick-the-user-account)). +- For long-running endpoints → add the function allowlist + ([MEP allowlist](#mep-allowlist)) and the audit cron + ([Step 7](#step-7--harden)). +- Rotate credentials every 90 days ([Day-2 ops](#day-2-operations)). + +--- + ## The eight steps 1. 
[Pick the user account](#step-1--pick-the-user-account) — service or personal @@ -267,7 +359,7 @@ Restart: globus-compute-endpoint restart uxarray ``` -#### MEP allowlist +### MEP allowlist For the strongest protection, convert to a **Multi-User Endpoint** with a function allowlist. Pre-register the ~20 functions uxarray-mcp uses; the diff --git a/src/uxarray_mcp/remote/health.py b/src/uxarray_mcp/remote/health.py index 57530e8..849f5a6 100644 --- a/src/uxarray_mcp/remote/health.py +++ b/src/uxarray_mcp/remote/health.py @@ -298,12 +298,15 @@ def _worker_probe() -> dict: message=r"(?s).*Environment differences detected between local SDK and endpoint.*", category=UserWarning, ) - with Executor( + ex = Executor( endpoint_id=endpoint_id, serializer=ComputeSerializer(strategy_code=AllCodeStrategies()), - ) as ex: + ) + try: fut = ex.submit(_worker_probe) result = fut.result(timeout=timeout_seconds) + finally: + ex.shutdown(wait=False) elapsed = round(time.monotonic() - t0, 1) pythonpath = result.get("pythonpath") or "" diff --git a/src/uxarray_mcp/tools/remote_tools.py b/src/uxarray_mcp/tools/remote_tools.py index d2f7cf4..bda4792 100644 --- a/src/uxarray_mcp/tools/remote_tools.py +++ b/src/uxarray_mcp/tools/remote_tools.py @@ -142,7 +142,22 @@ def _run_with_optional_hpc( tracker.stage( "submitted", f"Submitting {tool_name} to the HPC endpoint {endpoint_label}." ) - result = remote_call(agent) + try: + result = remote_call(agent) + except (TimeoutError, concurrent.futures.TimeoutError) as exc: + msg = ( + f"Remote execution of {tool_name} timed out after {agent.config.timeout_seconds} seconds. " + f"This usually means the job was submitted to the remote queue on endpoint '{endpoint_label}' but is " + "waiting in the scheduler queue (Slurm/PBS), or the compute nodes are busy. " + "You can increase the timeout in config.yaml or wait and try again." 
+ ) + tracker.fail(msg) + raise RuntimeError(msg) from exc + except Exception as exc: + msg = f"Remote execution of {tool_name} failed: {exc}" + tracker.fail(msg) + raise + result["_provenance"]["operation_id"] = tracker.operation_id tracker.succeed(f"{tool_name} completed with remote execution.") return result diff --git a/uv.lock b/uv.lock index 9edcbe2..1f4cb9f 100644 --- a/uv.lock +++ b/uv.lock @@ -4337,7 +4337,7 @@ wheels = [ [[package]] name = "uxarray-mcp" -version = "0.1.0" +version = "0.1.1" source = { editable = "." } dependencies = [ { name = "fastmcp" },