-
Notifications
You must be signed in to change notification settings - Fork 206
fix: handle vllm context length errors in nano v3 recipe #1752
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,16 @@ | |
| from nemo_gym.server_utils import SESSION_ID_KEY, is_nemo_gym_fastapi_entrypoint | ||
|
|
||
|
|
||
| CONTEXT_LENGTH_ERROR_SUBSTRINGS = ( | ||
| "context length", | ||
| "max_model_len", | ||
| "max model len", | ||
| "max_tokens", | ||
| "maximum context length", | ||
| "no room for output tokens", | ||
| ) | ||
|
|
||
|
|
||
| class VLLMModelConfig(BaseResponsesAPIModelConfig): | ||
| base_url: Union[str, List[str]] | ||
| api_key: str | ||
|
|
@@ -432,6 +442,22 @@ def _preprocess_chat_completion_create_params(self, request: Request, body_dict: | |
|
|
||
| return body_dict | ||
|
|
||
| @staticmethod | ||
| def _is_context_length_error(error: ClientResponseError) -> bool: | ||
| if error.status != 400: | ||
| return False | ||
|
|
||
| response_content = getattr(error, "response_content", b"") | ||
| if isinstance(response_content, bytes): | ||
| response_content_text = response_content.decode(errors="replace") | ||
| elif response_content is None: | ||
| response_content_text = "" | ||
| else: | ||
| response_content_text = str(response_content) | ||
|
|
||
| error_text = f"{error.message} {response_content_text}".lower() | ||
| return any(substring in error_text for substring in CONTEXT_LENGTH_ERROR_SUBSTRINGS) | ||
|
Comment on lines
+450
to
+459
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good defensive coding. The old code did `e.response_content.decode()` unconditionally. |
||
|
|
||
| async def chat_completions( | ||
| self, request: Request, body: NeMoGymChatCompletionCreateParamsNonStreaming = Body() | ||
| ) -> NeMoGymChatCompletion: | ||
|
|
@@ -461,12 +487,7 @@ async def chat_completions( | |
| 3. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L948 | ||
| 4. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/sampling_params.py#L463 | ||
| """ | ||
| result_content_str = e.response_content.decode() | ||
|
|
||
| is_out_of_context_length = e.status == 400 and ( | ||
| "context length" in result_content_str or "max_tokens" in result_content_str | ||
| ) | ||
| if is_out_of_context_length: | ||
| if self._is_context_length_error(e): | ||
| res = self._create_empty_chat_completion() | ||
| res.choices[0].finish_reason = "length" | ||
| return res | ||
|
|
@@ -529,7 +550,14 @@ async def chat_completions( | |
| tokenize_body_dict[key] = body_dict[key] | ||
|
|
||
| # The base url has /v1 at the end but vLLM's tokenize endpoint does not have v1, hence the .. | ||
| tokenize_response = await client.create_tokenize(**tokenize_body_dict) | ||
| try: | ||
| tokenize_response = await client.create_tokenize(**tokenize_body_dict) | ||
| except ClientResponseError as e: | ||
| if self._is_context_length_error(e): | ||
| res = self._create_empty_chat_completion() | ||
| res.choices[0].finish_reason = "length" | ||
| return res | ||
| raise | ||
|
Comment on lines
+553
to
+560
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. NOTE — When the tokenize call fails here, a successful chat completion (line 477) is silently discarded. This is the correct behavior: without For pure evaluation ( |
||
| """ | ||
| END | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NOTE —
`"max_tokens"` is the broadest substring here. A vLLM 400 whose body mentions `max_tokens` for a non-context-length reason (e.g., invalid value, type mismatch) would be silently swallowed and surfaced as `finish_reason="length"` instead of raising. This was already true in the old code so it's not a regression, but worth keeping in mind — if mysterious "length" finishes appear in logs, this matcher could be the culprit.

Not blocking; the substring approach is the pragmatic choice given vLLM's unstructured error surface.