diff --git a/nemo_gym/openai_utils.py b/nemo_gym/openai_utils.py
index d5c7ebc4a..5408569ad 100644
--- a/nemo_gym/openai_utils.py
+++ b/nemo_gym/openai_utils.py
@@ -560,3 +560,20 @@ async def create_tokenize(self, **kwargs):
 
         await self._raise_for_status(response, request_kwargs)
         return await get_response_json(response)
+
+    async def create_generate(self, **kwargs):
+        # SGLang's native (non-OpenAI) generation endpoint. Used by the SGLang
+        # engine path because, on the pinned SGLang v0.5.10, /v1/chat/completions
+        # does not expose the exact sampled integer token ids (its logprobs.token
+        # is a decoded string and there is no return_tokens_as_token_ids), whereas
+        # /generate with return_logprob=True returns meta_info.output_token_logprobs
+        # whose tuples are (logprob, token_id, ...). Lives at the server root, not /v1.
+        base_url = self.base_url.removesuffix("/v1")
+        request_kwargs = dict(
+            url=f"{base_url}/generate",
+            json=kwargs,
+        )
+        response = await self._request(method="POST", **request_kwargs)
+
+        await self._raise_for_status(response, request_kwargs)
+        return await get_response_json(response)
diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index a6709218f..b764cd99d 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -15,9 +15,10 @@
 import base64
 import json
 import os
+import re
 from copy import deepcopy
 from time import time
-from typing import Any, ClassVar, Dict, List, Optional, Union
+from typing import Any, ClassVar, Dict, List, Literal, Optional, Tuple, Union
 from uuid import uuid4
 
 from aiohttp.client_exceptions import ClientResponseError
@@ -78,6 +79,26 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig):
     # small without depending on vLLM's ``--allowed-local-media-path``.
     audio_root: Optional[str] = None
 
+    # Generation engine. "vllm" (default) keeps the original OpenAI /v1/chat/completions
+    # marshaling path unchanged. "sglang" switches to SGLang's native /generate endpoint
+    # (see VLLMModel._sglang_chat_completion): on the pinned SGLang v0.5.10 the chat endpoint
+    # cannot return the exact sampled integer token ids (decoded-string logprobs, no
+    # return_tokens_as_token_ids) and /tokenize only accepts a raw prompt string, so the
+    # proxy tokenizes locally and reads token ids from /generate's meta_info.output_token_logprobs.
+    engine: Literal["vllm", "sglang"] = "vllm"
+
+    # Path to the Jinja chat template the SGLang server was launched with (--chat-template).
+    # Used (engine == "sglang") so the proxy renders prompts with the same template the
+    # model was served/trained with. If None, the model tokenizer's built-in template is used.
+    sglang_chat_template_path: Optional[str] = None
+
+    # Max sequence length of the SGLang server (engine == "sglang"). When a request does not
+    # specify a positive max_tokens (e.g. the SWE agent sends max_output_tokens=0 = "unlimited"),
+    # the proxy fills the remaining context as max_new_tokens. Without this, SGLang /generate
+    # falls back to its default max_new_tokens=128, truncating reasoning before </think> and
+    # breaking multi-turn contiguity.
+    sglang_max_total_sequence_length: Optional[int] = None
+
     def model_post_init(self, context):
         if isinstance(self.base_url, str):
             self.base_url = [self.base_url]
@@ -115,6 +136,18 @@ def _post_init(self) -> None:
 
         self._converter = self.get_converter()
 
+        # Lazily-initialised state for the SGLang engine path (see _sglang_chat_completion).
+        self._sglang_tokenizer: Any = None
+        self._sglang_chat_template: Optional[str] = None
+        # Contiguity fix: per-session running token sequence. Each multi-turn rollout's prompt
+        # is built by splicing the prior assistant turn's EXACT sampled generation_token_ids
+        # (never re-tokenizing them), so nemo_gym.py's `seen == prompt[:len(seen)]` holds by
+        # construction. Re-tokenizing prior turns broke this two ways: proxy parse drift
+        # (dropped multi-line tool calls, mangled </think>) and BPE retokenization (identical
+        # text, different token split). Keyed by SESSION_ID_KEY; cache-miss -> full tokenize.
+        self._sglang_session_seq: Dict[str, Dict[str, Any]] = dict()
+        self._sglang_eos_nl_ids: Optional[List[int]] = None
+
     async def responses(
         self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming = Body()
     ) -> NeMoGymResponse:
@@ -436,6 +469,14 @@ async def chat_completions(
         self, request: Request, body: NeMoGymChatCompletionCreateParamsNonStreaming = Body()
     ) -> NeMoGymChatCompletion:
         body_dict = body.model_dump(exclude_unset=True)
+
+        # SGLang engine path: handled entirely by _sglang_chat_completion, which renders the
+        # prompt locally (keeping <think> embedded in assistant content, as the SWE chat
+        # template expects) and generates via /generate. Dispatched BEFORE the vLLM-specific
+        # _preprocess (which would split reasoning out of assistant content for the chat API).
+        if self.config.engine == "sglang":
+            return await self._sglang_chat_completion(request, body_dict)
+
         body_dict = self._preprocess_chat_completion_create_params(request, body_dict)
 
         client = self._resolve_client(request)
@@ -584,6 +625,332 @@ def _resolve_client(self, request: Request) -> NeMoGymAsyncOpenAI:
 
         return client
 
+    # =======================================================
+    # SGLang engine path (see VLLMModelConfig.engine == "sglang")
+    # =======================================================
+
+    # Hermes tool-call format emitted by SGLang's --tool-call-parser hermes and the SWE
+    # chat template: one or more <tool_call>\n{"name": ..., "arguments": ...}\n</tool_call>
+    # blocks. The capture is the JSON object, anchored by the closing tag (so nested braces
+    # in arguments are handled without brace-balancing).
+    _SGLANG_TOOL_CALL_PATTERN: ClassVar = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)
+    _SGLANG_ARGS_PATTERN: ClassVar = re.compile(r"\"arguments\"\s*:\s*(.*)\}\s*$", re.DOTALL)
+    # Turn-terminating special tokens the chat template re-emits after an assistant message.
+    # We strip them from the decoded generation so they are not doubled on history re-render.
+    _SGLANG_EOS_MARKERS: ClassVar = ("<|im_end|>", "<|endoftext|>")
+
+    def _get_sglang_tokenizer(self) -> Any:
+        if self._sglang_tokenizer is None:
+            from transformers import AutoTokenizer
+
+            self._sglang_tokenizer = AutoTokenizer.from_pretrained(self.config.model)
+        return self._sglang_tokenizer
+
+    def _get_sglang_chat_template(self) -> Optional[str]:
+        if self._sglang_chat_template is None and self.config.sglang_chat_template_path:
+            with open(self.config.sglang_chat_template_path) as f:
+                self._sglang_chat_template = f.read()
+        return self._sglang_chat_template
+
+    def _full_sglang_tokenize(
+        self, messages: List[Any], tools: Any, chat_template_kwargs: Dict[str, Any]
+    ) -> List[int]:
+        """Tokenize the full chat prompt via the chat template (the original, non-spliced path)."""
+        tokenizer = self._get_sglang_tokenizer()
+        encoded = tokenizer.apply_chat_template(
+            messages,
+            tools=tools,
+            chat_template=self._get_sglang_chat_template(),
+            add_generation_prompt=True,
+            tokenize=True,
+            **chat_template_kwargs,
+        )
+        # transformers v5's apply_chat_template(tokenize=True) returns a BatchEncoding
+        # (dict-like), not a flat list; normalize to a JSON-serializable List[int].
+        if isinstance(encoded, dict) or hasattr(encoded, "input_ids"):
+            encoded = encoded["input_ids"]
+        if hasattr(encoded, "tolist"):
+            encoded = encoded.tolist()
+        if encoded and isinstance(encoded[0], (list, tuple)):
+            encoded = encoded[0]
+        return [int(t) for t in encoded]
+
+    def _sglang_eos_nl(self) -> List[int]:
+        if self._sglang_eos_nl_ids is None:
+            enc = self._get_sglang_tokenizer()("<|im_end|>\n", add_special_tokens=False)
+            self._sglang_eos_nl_ids = [int(t) for t in enc["input_ids"]]
+        return self._sglang_eos_nl_ids
+
+    def _sglang_followup_fragment_ids(
+        self, new_msgs: List[Any], chat_template_kwargs: Dict[str, Any]
+    ) -> Optional[List[int]]:
+        """Token ids for the new (non-assistant) messages + the next generation-prompt header,
+        rendered as a standalone fragment that follows a prior assistant turn. Derived by
+        differencing two template renders against an anchor assistant turn, then tokenizing the
+        suffix. Safe to splice onto the running sequence because the splice boundary is the
+        assistant turn's ``<|im_end|>\\n`` (a special token), across which byte-level BPE does
+        not merge (validated). Returns None if the template is not splice-friendly -> caller
+        falls back to a full re-tokenize."""
+        tokenizer = self._get_sglang_tokenizer()
+        ct = self._get_sglang_chat_template()
+        anchor = [{"role": "assistant", "content": "X"}]
+        try:
+            full = tokenizer.apply_chat_template(
+                anchor + list(new_msgs), tools=None, chat_template=ct,
+                add_generation_prompt=True, tokenize=False, **chat_template_kwargs,
+            )
+            base = tokenizer.apply_chat_template(
+                anchor, tools=None, chat_template=ct,
+                add_generation_prompt=False, tokenize=False, **chat_template_kwargs,
+            )
+        except Exception:
+            return None
+        if not isinstance(full, str) or not isinstance(base, str) or not full.startswith(base):
+            return None
+        enc = tokenizer(full[len(base):], add_special_tokens=False)
+        return [int(t) for t in enc["input_ids"]]
+
+    @staticmethod
+    def _sglang_msg_sig(m: Dict[str, Any]) -> Tuple[Any, str, str]:
+        return (
+            m.get("role"),
+            json.dumps(m.get("content"), sort_keys=True, default=str),
+            json.dumps(m.get("tool_calls"), sort_keys=True, default=str),
+        )
+
+    @classmethod
+    def _sglang_messages_match(cls, a: List[Any], b: List[Any]) -> bool:
+        return len(a) == len(b) and all(
+            cls._sglang_msg_sig(x) == cls._sglang_msg_sig(y) for x, y in zip(a, b)
+        )
+
+    def _build_sglang_prompt_ids(
+        self, request: Request, messages: List[Any], tools: Any,
+        chat_template_kwargs: Dict[str, Any],
+    ) -> Tuple[List[int], Optional[str]]:
+        """Return (prompt_token_ids, session_id). Splices the prior assistant turn's exact
+        generation tokens when this is a continuation of a cached session; else full tokenize."""
+        try:
+            sid = request.session.get(SESSION_ID_KEY)
+        except Exception:
+            sid = None
+        if sid is not None:
+            state = self._sglang_session_seq.get(sid)
+            if state is not None:
+                prev = state["messages"]
+                n = len(prev)
+                if (
+                    len(messages) > n
+                    and messages[n].get("role") == "assistant"
+                    and all(m.get("role") != "assistant" for m in messages[n + 1:])
+                    and self._sglang_messages_match(messages[:n], prev)
+                ):
+                    frag = self._sglang_followup_fragment_ids(messages[n + 1:], chat_template_kwargs)
+                    if frag is not None:
+                        return state["seq"] + frag, sid
+        return self._full_sglang_tokenize(messages, tools, chat_template_kwargs), sid
+
+    def _update_sglang_session_seq(
+        self, sid: Optional[str], messages: List[Any],
+        prompt_token_ids: List[int], generation_token_ids: List[int],
+    ) -> None:
+        """Cache the running sequence through this assistant turn (prompt + gen + ``<|im_end|>\\n``)
+        for the next turn's splice."""
+        if sid is None:
+            return
+        eos_nl = self._sglang_eos_nl()  # e.g. [151645, 198]
+        seq = list(prompt_token_ids) + list(generation_token_ids)
+        if not seq or seq[-1] != eos_nl[0]:
+            seq = seq + eos_nl
+        else:
+            seq = seq + eos_nl[1:]  # gen already ended with <|im_end|>; just add the trailing \n
+        # Bound memory: refresh this sid's insertion order, evict oldest beyond the cap.
+        # Evicted sessions simply fall back to a full tokenize on their next turn (safe).
+        self._sglang_session_seq.pop(sid, None)
+        while len(self._sglang_session_seq) >= 8192:
+            self._sglang_session_seq.pop(next(iter(self._sglang_session_seq)), None)
+        self._sglang_session_seq[sid] = {"messages": list(messages), "seq": seq}
+
+    def _parse_sglang_generation(self, text: str) -> Tuple[Optional[str], str, List[Dict[str, Any]]]:
+        """Reconstruct (reasoning_content, content, tool_calls) from SGLang /generate raw text.
+
+        The qwen3-thinking generation prompt ends with ``<think>\\n``, so the generated text
+        begins INSIDE the reasoning block (no opening ``<think>``). Everything up to the first
+        ``</think>`` is therefore reasoning; hermes tool calls are parsed out of the remainder.
+        This mirrors what the SGLang server's reasoning_parser=qwen3-thinking +
+        tool_call_parser=hermes would have produced on /v1/chat/completions, so downstream
+        Responses marshaling is identical to the vLLM path.
+        """
+        reasoning_content: Optional[str] = None
+        if self.config.uses_reasoning_parser and "</think>" in text:
+            reasoning_content, _, remainder = text.partition("</think>")
+        else:
+            remainder = text
+
+        tool_calls: List[Dict[str, Any]] = []
+        for match in self._SGLANG_TOOL_CALL_PATTERN.finditer(remainder):
+            block = match.group(1)
+            try:
+                parsed = json.loads(block)
+            except json.JSONDecodeError:
+                continue
+            # Preserve the model's EXACT arguments serialization (function.arguments is a JSON
+            # string in the OpenAI schema). Keeping the raw substring -- rather than
+            # re-serializing the parsed dict -- means the chat template re-renders the assistant
+            # turn byte-identically, which the nemo_gym.py contiguity assert depends on.
+            args_match = self._SGLANG_ARGS_PATTERN.search(block)
+            if args_match is not None:
+                arguments = args_match.group(1).strip()
+            else:
+                arguments = json.dumps(parsed.get("arguments", {}))
+            tool_calls.append(
+                dict(
+                    id=f"call_{uuid4().hex}",
+                    type="function",
+                    function=dict(name=parsed.get("name"), arguments=arguments),
+                )
+            )
+
+        content = self._SGLANG_TOOL_CALL_PATTERN.sub("", remainder).strip()
+        return reasoning_content, content, tool_calls
+
+    async def _sglang_chat_completion(
+        self, request: Request, body_dict: Dict[str, Any]
+    ) -> NeMoGymChatCompletion:
+        """SGLang v0.5.10 generation path (see VLLMModelConfig.engine).
+
+        Tokenizes the chat-templated prompt locally and generates via SGLang's native
+        /generate (return_logprob=True) -- the only v0.5.10 source of the exact sampled
+        integer token ids AND their logprobs (needed for token-level RL). The decoded text
+        is re-parsed into reasoning + hermes tool_calls so the returned object is shaped
+        exactly like the vLLM /v1/chat/completions response, keeping every downstream
+        Responses-API conversion identical.
+        """
+        client = self._resolve_client(request)
+
+        messages = body_dict["messages"]
+        if self.config.replace_developer_role_with_system:
+            for message_dict in messages:
+                if message_dict.get("role") == "developer":
+                    message_dict["role"] = "system"
+        tools = body_dict.get("tools")
+
+        # Merge config chat_template_kwargs with per-request metadata overrides (mirrors
+        # _preprocess_chat_completion_create_params so reasoning toggles behave identically).
+        chat_template_kwargs: Dict[str, Any] = {}
+        if self.config.chat_template_kwargs:
+            chat_template_kwargs = deepcopy(self.config.chat_template_kwargs)
+        metadata = body_dict.get("metadata", dict())
+        chat_template_kwargs.update(json.loads(metadata.get("chat_template_kwargs", "{}")))
+
+        tokenizer = self._get_sglang_tokenizer()  # used below to decode generation_token_ids
+        # Build prompt token ids with contiguity-preserving splicing across turns (falls back
+        # to a full chat-template tokenize on the first turn / cache miss / history condensation).
+        prompt_token_ids, _splice_sid = self._build_sglang_prompt_ids(
+            request, messages, tools, chat_template_kwargs
+        )
+
+        # Map the OpenAI sampling knobs onto SGLang /generate sampling_params.
+        # spaces_between_special_tokens=False mirrors the NeMo-RL SGLang backend and keeps
+        # special tokens (</think>, <tool_call>) tight in the decoded text we parse below.
+        sampling_params: Dict[str, Any] = {"spaces_between_special_tokens": False}
+        # max_tokens of None OR 0 means "unlimited" (the SWE agent sends max_output_tokens=0).
+        # SGLang /generate would otherwise fall back to max_new_tokens=128 and truncate reasoning
+        # before </think>; fill the remaining context instead (matches vLLM + the recipe).
+        max_new_tokens = body_dict.get("max_tokens") or None
+        if max_new_tokens is None and self.config.sglang_max_total_sequence_length:
+            # Fill the remaining context, leaving a small margin: SGLang /generate rejects a
+            # request whose input + max_new_tokens >= context_length (it requires strictly less,
+            # unlike vLLM which allows ==). Reserve a few tokens to stay safely under the limit.
+            max_new_tokens = self.config.sglang_max_total_sequence_length - len(prompt_token_ids) - 8
+            max_new_tokens = max(1, max_new_tokens)
+        if max_new_tokens:
+            sampling_params["max_new_tokens"] = max_new_tokens
+        for key in ("temperature", "top_p", "top_k", "stop"):
+            if body_dict.get(key) is not None:
+                sampling_params[key] = body_dict[key]
+
+        gen = await client.create_generate(
+            input_ids=prompt_token_ids,
+            sampling_params=sampling_params,
+            return_logprob=True,
+        )
+
+        meta_info = gen.get("meta_info") or {}
+        # Each tuple is (logprob, token_id, ...). Sourcing both ids and logprobs from the SAME
+        # list guarantees they are 1:1 aligned in count and order -- mirrors
+        # nemo_rl/models/generation/sglang/sglang_generation.py:generate_one_sample.
+        output_token_logprobs = meta_info.get("output_token_logprobs") or []
+        generation_token_ids = [item[1] for item in output_token_logprobs]
+        generation_log_probs = [item[0] for item in output_token_logprobs]
+
+        # Contiguity fix: cache the running token sequence through this assistant turn so the
+        # next turn splices these EXACT generation_token_ids instead of re-tokenizing them.
+        self._update_sglang_session_seq(
+            _splice_sid, messages, prompt_token_ids, generation_token_ids
+        )
+
+        # Decode the EXACT sampled ids with skip_special_tokens=False so reasoning markers
+        # (</think> is a SPECIAL token, id 151668) survive. SGLang's gen["text"] decodes with
+        # skip_special_tokens=True by default, which STRIPS </think> -> the reasoning never gets
+        # wrapped -> the <think> wrapper is dropped on history re-render -> the nemo_gym.py
+        # contiguity assert fires on every multi-turn rollout. spaces_between_special_tokens=False
+        # keeps the markers tight. Then strip the trailing EOS the chat template re-adds.
+        generated_text = tokenizer.decode(
+            generation_token_ids, skip_special_tokens=False, spaces_between_special_tokens=False
+        )
+        _stripped = True
+        while _stripped:
+            _stripped = False
+            generated_text = generated_text.rstrip("\n")
+            for _eos in self._SGLANG_EOS_MARKERS:
+                if generated_text.endswith(_eos):
+                    generated_text = generated_text[: -len(_eos)]
+                    _stripped = True
+        reasoning_content, content, tool_calls = self._parse_sglang_generation(generated_text)
+
+        if (meta_info.get("finish_reason") or {}).get("type") == "length":
+            finish_reason = "length"
+        elif tool_calls:
+            finish_reason = "tool_calls"
+        else:
+            finish_reason = "stop"
+
+        # Re-embed reasoning into <think> tags and prepend to content, identical to the vLLM
+        # reasoning-parser branch, so postprocess_assistant_message_dict re-extracts it.
+        if self.config.uses_reasoning_parser and reasoning_content:
+            content = self._converter._wrap_reasoning_in_think_tags([reasoning_content]) + (content or "")
+
+        message_dict: Dict[str, Any] = dict(
+            role="assistant",
+            content=content or None,
+            tool_calls=tool_calls or None,
+        )
+        if self.config.return_token_id_information:
+            message_dict.update(
+                dict(
+                    prompt_token_ids=prompt_token_ids,
+                    generation_token_ids=generation_token_ids,
+                    generation_log_probs=generation_log_probs,
+                )
+            )
+
+        chat_completion_dict = dict(
+            id=f"chtcmpl-{uuid4().hex}",
+            object="chat.completion",
+            created=int(time()),
+            model=self.config.model,
+            choices=[
+                dict(index=0, finish_reason=finish_reason, message=message_dict, logprobs=None)
+            ],
+            usage=dict(
+                prompt_tokens=len(prompt_token_ids),
+                completion_tokens=len(generation_token_ids),
+                total_tokens=len(prompt_token_ids) + len(generation_token_ids),
+            ),
+        )
+        return NeMoGymChatCompletion.model_validate(chat_completion_dict)
+
 
 if __name__ == "__main__":
     VLLMModel.run_webserver()
diff --git a/responses_api_models/vllm_model/pyproject.toml b/responses_api_models/vllm_model/pyproject.toml
index 8e52b8b7f..35cfe2c76 100644
--- a/responses_api_models/vllm_model/pyproject.toml
+++ b/responses_api_models/vllm_model/pyproject.toml
@@ -19,6 +19,10 @@ version = "0.2.0rc0"
 requires-python = ">=3.12"
 dependencies = [
     "nemo-gym[dev]",
+    # Used by the SGLang engine path (VLLMModelConfig.engine == "sglang") to render the
+    # chat template + tokenize prompts locally. Pinned to match the version the SWE recipe
+    # was validated with; relax/adjust as upstream dependency policy requires.
+    "transformers==5.3.0",
 ]
 
 [build-system]