-
Notifications
You must be signed in to change notification settings - Fork 438
DRAFT fix: Nano-v3 recipe run fix. #2867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9ff5657
af874c4
8262980
cfc93f4
875e702
d35e2d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| # limitations under the License. | ||
| import os | ||
| import subprocess | ||
| import sys | ||
| from pathlib import Path | ||
| from typing import Any, Dict, List, NotRequired, TypedDict | ||
|
|
||
|
|
@@ -38,6 +39,30 @@ | |
| DEFAULT_THINKING_TAGS = ["<think>", "</think>"] | ||
|
|
||
|
|
||
def _ensure_nemo_gym_package_precedence() -> None:
    """Make ``import nemo_gym`` resolve to the vendored NeMo-Gym checkout.

    The checkout is expected at ``<repo_root>/3rdparty/Gym-workspace/Gym``,
    where ``<repo_root>`` is the third ancestor directory of this file.
    If ``nemo_gym/__init__.py`` is not present there (for example, the
    checkout is missing), the function does nothing.

    Otherwise it:

    1. Moves the checkout directory to the front of ``sys.path``, dropping
       any later duplicates of the same string.
    2. Drops ``nemo_gym`` and every ``nemo_gym.*`` entry from
       ``sys.modules`` when the cached ``nemo_gym`` module has no
       ``__file__`` and no ``PARENT_DIR`` attribute. A module without
       ``__file__`` is what Python creates for a namespace package, i.e. a
       directory named ``nemo_gym`` without an ``__init__.py`` (such as
       ``examples/nemo_gym``) that was picked up by an earlier import.
       A module that has ``__file__`` or ``PARENT_DIR`` is left untouched.
    3. Invalidates importlib's finder caches whenever step 1 or 2 changed
       anything, so the next ``import nemo_gym`` re-scans the path instead
       of relying on finder state recorded before the change.

    Calling the function repeatedly is safe: once the checkout is first on
    ``sys.path`` and no shadow module is cached, it makes no changes.
    """
    import importlib

    repo_root = Path(__file__).resolve().parents[2]
    gym_workspace = repo_root / "3rdparty" / "Gym-workspace" / "Gym"
    if not (gym_workspace / "nemo_gym" / "__init__.py").exists():
        return

    gym_workspace_str = str(gym_workspace)
    import_state_changed = False

    if sys.path[:1] != [gym_workspace_str]:
        # Mutate the list in place so other holders of the ``sys.path``
        # object observe the new ordering too.
        sys.path[:] = [
            gym_workspace_str,
            *(entry for entry in sys.path if entry != gym_workspace_str),
        ]
        import_state_changed = True

    shadowed_module = sys.modules.get("nemo_gym")
    is_namespace_shadow = (
        shadowed_module is not None
        and getattr(shadowed_module, "__file__", None) is None
        # NOTE(review): PARENT_DIR is presumably defined by the real
        # NeMo-Gym package; confirm against nemo_gym/__init__.py.
        and not hasattr(shadowed_module, "PARENT_DIR")
    )
    if is_namespace_shadow:
        stale_names = [
            name
            for name in sys.modules
            if name == "nemo_gym" or name.startswith("nemo_gym.")
        ]
        for name in stale_names:
            del sys.modules[name]
        import_state_changed = True

    if import_state_changed:
        # A namespace shadow can only have been imported while the real
        # package was not discoverable, so path finders may still hold
        # state from that time. Force them to look again.
        importlib.invalidate_caches()
|
|
||
|
|
||
| def get_nemo_gym_uv_cache_dir() -> str | None: | ||
| """Return the uv cache directory inside a container, or None outside one. | ||
|
|
||
|
|
@@ -158,6 +183,8 @@ def _spinup(self) -> None: | |
| _gym_port_high = self.cfg.get("port_range_high", DEFAULT_GYM_PORT_RANGE_HIGH) | ||
| self.head_server_port = _get_free_port_local(_gym_port_low, _gym_port_high) | ||
|
|
||
| _ensure_nemo_gym_package_precedence() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When does this happen? |
||
|
|
||
| from nemo_gym.cli import GlobalConfigDictParserConfig, RunHelper | ||
| from nemo_gym.rollout_collection import RolloutCollectionHelper | ||
| from nemo_gym.server_utils import HEAD_SERVER_KEY_NAME, BaseServerConfig | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -722,15 +722,22 @@ async def create_chat_completion( | |||||||||||
| generator = await openai_serving_chat.create_chat_completion( | ||||||||||||
| request, raw_request | ||||||||||||
| ) | ||||||||||||
| except VLLMValidationError as e: | ||||||||||||
| except (ValueError, VLLMValidationError) as e: | ||||||||||||
| # vLLM 0.20 raises VLLMValidationError for prompts exceeding | ||||||||||||
| # max_model_len during tokenization, instead of returning an | ||||||||||||
| # ErrorResponse. Convert to HTTP 400 so the Gym proxy can | ||||||||||||
| # detect context-length overflow and handle it gracefully. | ||||||||||||
| # ErrorResponse. Our post-tokenization clamp can raise a local | ||||||||||||
| # ValueError for the same condition after prefix replacement. | ||||||||||||
| # Convert those cases to HTTP 400 so the Gym proxy can detect | ||||||||||||
| # context-length overflow and handle it gracefully. | ||||||||||||
| message = str(e) | ||||||||||||
| if isinstance(e, ValueError) and not ( | ||||||||||||
| "max_model_len" in message or "maximum context length" in message | ||||||||||||
| ): | ||||||||||||
| raise | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
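Consider gating the substring filter so it only applies to plain `ValueError`, not to `VLLMValidationError` (which subclasses `ValueError` and would otherwise be re-raised when its message lacks those substrings).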
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, do you have examples for when the current behavior is a problem? |
||||||||||||
| return JSONResponse( | ||||||||||||
| content={ | ||||||||||||
| "error": { | ||||||||||||
| "message": str(e), | ||||||||||||
| "message": message, | ||||||||||||
| "type": "invalid_request_error", | ||||||||||||
| "code": 400, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -525,7 +525,12 @@ echo "All workers connected!" | |
| # This driver process is responsible for launching a job on the Ray cluster | ||
| CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1) | ||
| if [[ -n "$COMMAND" ]]; then | ||
| set +e | ||
| srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND" | ||
| driver_exit_code=$? | ||
| set -e | ||
| touch "$LOG_DIR/ENDED" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's separate this one out to another PR. It seems pretty unrelated to the rest. |
||
| exit "$driver_exit_code" | ||
| else | ||
| echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:" | ||
| cat <<EOF >$SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nemo_gym.py:42 — These fixes currently ship without tests. Two are pure, CPU-only logic and would be cheap to cover:
1. `_ensure_nemo_gym_package_precedence` — monkeypatch `sys.path`/`sys.modules` and point at a temp `gym_init`; assert the namespace-package shadow is purged, a real package (with `__file__`/`PARENT_DIR`) is preserved, a missing submodule is a no-op, and the call is idempotent. Home: `tests/unit/environments/test_nemo_gym.py`.
2. `ValueError` → 400 filter — a context-length `ValueError` (message containing `max_model_len`) → 400, a generic `ValueError` → re-raise, and a `VLLMValidationError` without those substrings → 400 (this last case guards the regression flagged in the other comment). `tests/unit/models/generation/test_vllm_generation.py:240` already stubs `VLLMValidationError` (note: the stub there subclasses `Exception`, not `ValueError` — it should subclass `ValueError` to mirror the real class).
Since the `sys.path`/`sys.modules` manipulation is the subtlest change here, a unit test for (1) would be especially valuable before merge.