diff --git a/docs/operating-an-endpoint.md b/docs/operating-an-endpoint.md index 80c9eca..2a9b40a 100644 --- a/docs/operating-an-endpoint.md +++ b/docs/operating-an-endpoint.md @@ -57,7 +57,7 @@ Slurm account or PBS project ID, and the site's conda/module convention. ```bash # 1. On the HPC machine, in your account module load conda # or whatever your site provides -conda create -n gce python=3.11 -c conda-forge -y +conda create -n gce python=3.12 -c conda-forge -y conda activate gce pip install globus-compute-endpoint uxarray xarray netCDF4 h5netcdf @@ -65,6 +65,17 @@ pip install globus-compute-endpoint uxarray xarray netCDF4 h5netcdf globus-compute-endpoint configure uxarray ``` +> **Pick the worker Python version deliberately.** Globus Compute serialization +> tolerates patch-level skew within the same minor version (e.g. 3.12.10 ↔ 3.12.13) but breaks across minor +> versions (3.12 ↔ 3.13). The packaged MCP tools route through +> `AllCodeStrategies` and survive minor-version differences for simple payloads, but +> raw `Executor.submit()` calls and any user code dropping down to the SDK +> directly will hit pickle protocol errors and `WorkerLost`. The simplest +> rule: pick a Python on the worker (3.12 is broadly available across HPC +> sites today) and match it on the submitter side with +> `uv tool install --python 3.12 uxarray-mcp`. `uxarray-mcp doctor` will +> surface a warning at probe time when versions differ. + Edit `~/.globus_compute/uxarray/config.yaml` and set the scheduler block. 
Minimum diff from the generated template (PBS example shown — see Step 3 below for Slurm): diff --git a/src/uxarray_mcp/remote/health.py b/src/uxarray_mcp/remote/health.py index 849f5a6..309d84b 100644 --- a/src/uxarray_mcp/remote/health.py +++ b/src/uxarray_mcp/remote/health.py @@ -42,6 +42,7 @@ from __future__ import annotations +import sys import threading import time import warnings @@ -311,25 +312,67 @@ def _worker_probe() -> dict: elapsed = round(time.monotonic() - t0, 1) pythonpath = result.get("pythonpath") or "" yac_pythonpath = _is_expected_yac_pythonpath(pythonpath) + worker_python = result.get("python", "") payload = { "status": "active", **_endpoint_public_fields(config), "node": result.get("node", ""), - "python": result.get("python", ""), + "python": worker_python, "slurm_job_id": result.get("slurm_job_id") or None, "pbs_job_id": result.get("pbs_job_id") or None, "pythonpath_set": bool(pythonpath), "pythonpath_expected_yac_runtime": yac_pythonpath, "elapsed_seconds": elapsed, } + + warnings_list: list[str] = [] + + # Python-minor-version mismatch between submitter and worker is a + # subtle footgun: high-level MCP tools route through AllCodeStrategies + # and tolerate skew, but raw Executor.submit() with default + # serialization will hit pickle protocol differences and raise + # WorkerLost. Surface this at probe time so users see it before they + # build something that breaks in production. + if worker_python: + try: + worker_major, worker_minor = ( + int(p) for p in worker_python.split(".")[:2] + ) + except (ValueError, IndexError): + worker_major, worker_minor = None, None + if ( + worker_major is not None + and worker_minor is not None + and ( + worker_major != sys.version_info.major + or worker_minor != sys.version_info.minor + ) + ): + warnings_list.append( + f"Python minor-version mismatch: SDK " + f"{sys.version_info.major}.{sys.version_info.minor}, worker " + f"{worker_major}.{worker_minor}. 
The packaged MCP tools work " + f"because they ship function source via AllCodeStrategies, " + f"but raw Executor.submit(fn) using default serialization " + f"may raise WorkerLost. Reinstall locally on the worker's " + f"Python: `uv tool install --python " + f"{worker_major}.{worker_minor} uxarray-mcp` or, in a dev " + f"clone, `uv sync --python {worker_major}.{worker_minor}`." + ) + # Warn on arbitrary PYTHONPATH leaks, but allow endpoint-side YAC paths. if pythonpath and not yac_pythonpath: - payload["warning"] = ( + warnings_list.append( "PYTHONPATH is set on the worker. This can cause pydantic/dill " "conflicts. Add 'unset PYTHONPATH' to worker_init in the endpoint " "config, and only set narrow runtime paths such as the YAC " "Python bindings when they are required." ) + + if warnings_list: + payload["warnings"] = warnings_list + # Back-compat: keep singular `warning` populated with the first. + payload["warning"] = warnings_list[0] return payload except TimeoutError: