From 7143ebf93b9c865f86bd5faf227de9ea55a937b5 Mon Sep 17 00:00:00 2001
From: Onur Yilmaz <oyilmaz@nvidia.com>
Date: Thu, 11 Jun 2026 15:09:44 -0400
Subject: [PATCH] Addressing new inference updates in mcore

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
---
 nemo_deploy/llm/inference/inference_base.py   | 74 ++++++++-----------
 nemo_deploy/llm/megatronllm_deployable.py     | 34 ++++-----
 .../functional_tests/utils/run_nemo_export.py |  6 +-
 .../deploy/test_etp_sequence_parallel.py      | 50 ++++---------
 .../unit_tests/deploy/test_inference_base.py  | 61 ++++++---------
 .../test_megatron_multimodal_deployable.py    | 14 ++--
 .../deploy/test_megatronllm_deployable.py     | 51 +++++++------
 7 files changed, 119 insertions(+), 171 deletions(-)

diff --git a/nemo_deploy/llm/inference/inference_base.py b/nemo_deploy/llm/inference/inference_base.py
index 9c8681441..b2f16d4fd 100644
--- a/nemo_deploy/llm/inference/inference_base.py
+++ b/nemo_deploy/llm/inference/inference_base.py
@@ -27,14 +27,8 @@
     get_default_load_sharded_strategy,
 )
 from megatron.core.dist_checkpointing.validation import StrictHandling
-from megatron.core.inference.contexts.static_context import StaticInferenceContext
-from megatron.core.inference.engines.mcore_engine import MCoreEngine
-from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
-    GPTInferenceWrapper,
-)
-from megatron.core.inference.text_generation_controllers.text_generation_controller import (
-    TextGenerationController,
-)
+from megatron.core.inference.apis import MegatronLLM
+from megatron.core.inference.config import InferenceConfig
 from megatron.core.transformer.enums import AttnBackend
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.transformer_config import MLATransformerConfig
@@ -390,27 +384,24 @@ def setup_model_and_tokenizer_for_inference(
 
 
 class MCoreEngineWithCleanup:
-    """Wrapper around MCoreEngine that ensures proper cleanup of distributed resources.
+    """Wrapper around MegatronLLM that ensures proper cleanup of distributed resources.
 
-    This class delegates all operations to the underlying MCoreEngine while ensuring that
-    distributed resources are properly cleaned up when the engine is destroyed.
+    This class delegates all operations to the underlying MegatronLLM engine while ensuring
+    that distributed resources are properly cleaned up when the engine is destroyed.
     """
 
     def __init__(
         self,
-        mcore_engine: MCoreEngine,
-        model_inference_wrapper: GPTInferenceWrapper,
+        llm: MegatronLLM,
         tokenizer: Union[MCoreTokenizerWrappper, MegatronTokenizer],
     ):
         """Initialize the MCoreEngineWithCleanup.
 
         Args:
-            mcore_engine (MCoreEngine): The underlying MCoreEngine instance
-            model_inference_wrapper (GPTInferenceWrapper): The model inference wrapper
+            llm (MegatronLLM): The underlying MegatronLLM instance
             tokenizer (Union[MCoreTokenizerWrappper, MegatronTokenizer]): The tokenizer instance
         """
-        self.mcore_engine = mcore_engine
-        self.model_inference_wrapper = model_inference_wrapper
+        self.mcore_engine = llm
         self.tokenizer = tokenizer
 
     def __del__(self):
@@ -446,8 +437,8 @@ def create_mcore_engine(
     buffer_size_gb: float = 10.0,
     legacy_model_format: bool = False,
     **model_config_kwargs,
-) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
-    """Set up the model, tokenizer and MCoreEngine for inference.
+) -> Tuple[MCoreEngineWithCleanup, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
+    """Set up the model, tokenizer and MegatronLLM engine for inference.
 
     Args:
         path (Path): Path to the checkpoint file
@@ -455,7 +446,7 @@ def create_mcore_engine(
         inference_batch_times_seqlen_threshold (int): Threshold for batch size times sequence length
         inference_max_seq_length (int): Maximum sequence length for inference
         max_batch_size (int): Maximum batch size for inference
-        random_seed (Optional[int]): Random seed for reproducibility
+        random_seed (Optional[int]): Random seed for reproducibility (set globally during init)
         tensor_model_parallel_size (Optional[int]): Size of tensor model parallelism
         pipeline_model_parallel_size (Optional[int]): Size of pipeline model parallelism
         context_parallel_size (Optional[int]): Size of context parallelism
@@ -466,11 +457,10 @@ def create_mcore_engine(
         model_type (str): Type of model to load (default: "gpt")
         model_format (str): Format of model to load (default: "nemo")
         micro_batch_size (Optional[int]): Micro batch size for model execution
-        legacy_model_format (bool): Whether to use the legacy StaticInferenceEngine path in MCoreEngine (default: False)
+        legacy_model_format (bool): Deprecated; no longer used (DynamicInferenceEngine is always used)
     Returns:
-        Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: Tuple containing:
+        Tuple[MCoreEngineWithCleanup, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: Tuple containing:
             - MCoreEngineWithCleanup: Engine for text generation with proper cleanup
-            - GPTInferenceWrapper: Inference-wrapped model
             - Union[MCoreTokenizerWrappper, MegatronTokenizer]: Tokenizer instance
     """
     # Default to 1 for any parallelism dimension that's None
@@ -512,34 +502,32 @@ def create_mcore_engine(
     else:
         raise ValueError(f"Model format {model_format} not supported.")
 
-    # MLA models require block_size_tokens=64 for the dynamic engine, which is not
-    # configurable in the current Megatron-LM version. Fall back to the legacy static
-    # engine so MLA inference works correctly without touching Megatron-LM.
+    model.eval()
+
+    # MLA models require block_size_tokens=64 for correct KV cache operation with the dynamic
+    # inference engine (256 is used otherwise). For MLA, also default the attention backend to flash if unset.
+    block_size_tokens = 256
     model_config = getattr(model, "config", None)
     if isinstance(model_config, MLATransformerConfig):
-        legacy_model_format = True
-        # The legacy static engine requires an explicit attention backend.
-        # MLA models use flash attention (attention_mask is handled internally).
+        block_size_tokens = 64
         if not model_config.attention_backend:
             model_config.attention_backend = AttnBackend.flash
 
-    inference_context = StaticInferenceContext(
-        max_batch_size=max_batch_size,
+    inference_config = InferenceConfig(
         max_sequence_length=inference_max_seq_length,
+        buffer_size_gb=int(buffer_size_gb),
+        max_requests=max_batch_size,
+        block_size_tokens=block_size_tokens,
+        materialize_only_last_token_logits=True,
     )
-    model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
-    text_generation_controller = TextGenerationController(
-        inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
-    )
-    mcore_engine = MCoreEngine(
-        text_generation_controller=text_generation_controller,
-        max_batch_size=max_batch_size,
-        random_seed=random_seed,
-        buffer_size_gb=buffer_size_gb,
-        legacy=legacy_model_format,
+
+    llm = MegatronLLM(
+        model=model,
+        tokenizer=tokenizer,
+        inference_config=inference_config,
     )
 
     # Wrap the engine to ensure cleanup
-    wrapped_engine = MCoreEngineWithCleanup(mcore_engine, model_inference_wrapper, tokenizer)
+    wrapped_engine = MCoreEngineWithCleanup(llm, tokenizer)
 
-    return wrapped_engine, model_inference_wrapper, tokenizer
+    return wrapped_engine, tokenizer
diff --git a/nemo_deploy/llm/megatronllm_deployable.py b/nemo_deploy/llm/megatronllm_deployable.py
index ccbeddb12..10910ac90 100755
--- a/nemo_deploy/llm/megatronllm_deployable.py
+++ b/nemo_deploy/llm/megatronllm_deployable.py
@@ -21,8 +21,8 @@
 import torch
 import torch.distributed
 from jinja2 import Template
-from megatron.core.inference.common_inference_params import CommonInferenceParams
-from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.inference_request import DynamicInferenceRequest
+from megatron.core.inference.sampling_params import SamplingParams
 
 from nemo_deploy import ITritonDeployable
 from nemo_deploy.llm.inference.inference_base import create_mcore_engine
@@ -113,7 +113,7 @@ def __init__(
         if model_type not in ["gpt", "mamba"]:
             raise ValueError(f"Model type {model_type} not supported for Megatron models.")
 
-        self.mcore_engine, self.inference_wrapped_model, self.mcore_tokenizer = create_mcore_engine(
+        self.mcore_engine, self.mcore_tokenizer = create_mcore_engine(
             num_devices=num_devices,
             num_nodes=num_nodes,
             path=Path(megatron_checkpoint_filepath),
@@ -144,18 +144,18 @@ def __init__(
     def generate(
         self,
         prompts: List[str],
-        inference_params: Optional[CommonInferenceParams] = None,
-    ) -> List[InferenceRequest]:
+        inference_params: Optional[SamplingParams] = None,
+    ) -> List[DynamicInferenceRequest]:
         """Generates text based on the provided input prompts.
 
         Args:
             prompts (List[str]): A list of input strings.
-            inference_params (Optional[CommonInferenceParams]): Parameters for controlling the inference process.
+            inference_params (Optional[SamplingParams]): Parameters for controlling the inference process.
 
         Returns:
-            List[InferenceRequest]: A list containing the generated results.
+            List[DynamicInferenceRequest]: A list containing the generated results.
         """
-        inference_params = inference_params or CommonInferenceParams()
+        inference_params = inference_params or SamplingParams()
 
         # Store the original number of prompts
         orig_num_prompts = len(prompts)
@@ -173,8 +173,7 @@ def generate(
 
             results = self.mcore_engine.generate(
                 prompts=padded_prompts,
-                add_BOS=False,
-                common_inference_params=inference_params,
+                sampling_params=inference_params,
             )
 
             # Only return results for the original prompts
@@ -182,8 +181,7 @@ def generate(
         else:
             results = self.mcore_engine.generate(
                 prompts=prompts,
-                add_BOS=False,
-                common_inference_params=inference_params,
+                sampling_params=inference_params,
             )
             return list(results)
 
@@ -198,7 +196,7 @@ def generate_other_ranks(self):
                     data=[None], src=0
                 )
 
-                inference_params = CommonInferenceParams(
+                inference_params = SamplingParams(
                     temperature=temperature,
                     top_k=int(top_k),
                     top_p=float(top_p),
@@ -208,7 +206,7 @@ def generate_other_ranks(self):
                 )
 
                 if log_probs:
-                    dynamic_engine = getattr(self.mcore_engine, "dynamic_engine", None)
+                    dynamic_engine = getattr(self.mcore_engine, "engine", None)
                     if dynamic_engine is not None:
                         dynamic_engine.materialize_only_last_token_logits = False
                         dynamic_engine.context.config.materialize_only_last_token_logits = False
@@ -419,15 +417,15 @@ def _infer_fn(
                 )
 
         # cast top_k,top_p to native int, float since typecheck assert statements added in MCore0.13 error otherwise
-        # return_prompt_top_n_logprobs returns top_logprobs for prompt tokens too when top_logprobs>0.
-        inference_params = CommonInferenceParams(
+        # skip_prompt_log_probs is False only when top_logprobs>0, so prompt top-N logprobs are returned only then.
+        inference_params = SamplingParams(
             temperature=temperature,
             top_k=int(top_k),
             top_p=float(top_p),
             num_tokens_to_generate=num_tokens_to_generate,
             return_log_probs=log_probs,
             top_n_logprobs=top_logprobs,
-            return_prompt_top_n_logprobs=bool(top_logprobs),
+            skip_prompt_log_probs=not bool(top_logprobs),
             stop_words=stop_words,
         )
 
@@ -436,7 +434,7 @@ def _infer_fn(
         # (prompt log probs are required for logprob eval benchmarks).
         # Toggle it on both the engine and the context config (controls the
         # model forward pass and log prob calculations).
-        dynamic_engine = getattr(self.mcore_engine, "dynamic_engine", None)
+        dynamic_engine = getattr(self.mcore_engine, "engine", None)
         needs_all_logits = log_probs or bool(top_logprobs)
         if dynamic_engine is not None and needs_all_logits:
             dynamic_engine.materialize_only_last_token_logits = False
diff --git a/tests/functional_tests/utils/run_nemo_export.py b/tests/functional_tests/utils/run_nemo_export.py
index ce6938299..521be9255 100644
--- a/tests/functional_tests/utils/run_nemo_export.py
+++ b/tests/functional_tests/utils/run_nemo_export.py
@@ -35,13 +35,13 @@
 
 in_framework_supported = True
 try:
-    from megatron.core.inference.common_inference_params import CommonInferenceParams
+    from megatron.core.inference.sampling_params import SamplingParams
 
     from nemo_deploy.llm import NemoQueryLLMPyTorch
     from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable
 except Exception as e:
     LOGGER.warning(
-        "Cannot import MegatronLLMDeployable class, or NemoQueryLLMPyTorch, or CommonInferenceParams, "
+        "Cannot import MegatronLLMDeployable class, or NemoQueryLLMPyTorch, or SamplingParams, "
         f"in-framework inference will not be available. Reason: {type(e).__name__}: {e}"
     )
     in_framework_supported = False
@@ -98,7 +98,7 @@ def get_accuracy_with_lambada(model, nq, lora_uids, test_data_path, use_vllm: bo
                 if in_framework_supported and isinstance(model, MegatronLLMDeployable):
                     model_output = model.generate(
                         prompts=[prompt],
-                        inference_params=CommonInferenceParams(
+                        inference_params=SamplingParams(
                             temperature=0.1,
                             top_k=1,
                             top_p=0.0,
diff --git a/tests/unit_tests/deploy/test_etp_sequence_parallel.py b/tests/unit_tests/deploy/test_etp_sequence_parallel.py
index 732f2d067..6cdc1069f 100644
--- a/tests/unit_tests/deploy/test_etp_sequence_parallel.py
+++ b/tests/unit_tests/deploy/test_etp_sequence_parallel.py
@@ -389,19 +389,14 @@ class TestCreateMcoreEngineETPSequenceParallel(unittest.TestCase):
     """Tests that create_mcore_engine handles ETP/SP defaults and passes them down."""
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    def test_etp_defaults_to_1_when_none(
-        self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup
-    ):
+    def test_etp_defaults_to_1_when_none(self, mock_cleanup, mock_llm_cls, mock_setup):
         """expert_tensor_parallel_size=None is normalised to 1 before forwarding."""
         from nemo_deploy.llm.inference.inference_base import create_mcore_engine
 
         mock_setup.return_value = ([MagicMock()], MagicMock())
-        mock_engine_cls.return_value = MagicMock()
+        mock_llm_cls.return_value = MagicMock()
         mock_cleanup.return_value = MagicMock()
 
         create_mcore_engine(path=Path("/fake"), model_format="nemo", expert_tensor_parallel_size=None)
@@ -410,19 +405,14 @@ def test_etp_defaults_to_1_when_none(
         assert kwargs["expert_tensor_parallel_size"] == 1
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    def test_sp_defaults_to_1_when_none(
-        self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup
-    ):
+    def test_sp_defaults_to_1_when_none(self, mock_cleanup, mock_llm_cls, mock_setup):
         """sequence_parallel=None is normalised to 1 before forwarding."""
         from nemo_deploy.llm.inference.inference_base import create_mcore_engine
 
         mock_setup.return_value = ([MagicMock()], MagicMock())
-        mock_engine_cls.return_value = MagicMock()
+        mock_llm_cls.return_value = MagicMock()
         mock_cleanup.return_value = MagicMock()
 
         create_mcore_engine(path=Path("/fake"), model_format="nemo", sequence_parallel=None)
@@ -431,19 +421,14 @@ def test_sp_defaults_to_1_when_none(
         assert kwargs["sequence_parallel"] == 1
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    def test_explicit_etp_passed_through(
-        self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup
-    ):
+    def test_explicit_etp_passed_through(self, mock_cleanup, mock_llm_cls, mock_setup):
         """An explicit expert_tensor_parallel_size value is forwarded unchanged."""
         from nemo_deploy.llm.inference.inference_base import create_mcore_engine
 
         mock_setup.return_value = ([MagicMock()], MagicMock())
-        mock_engine_cls.return_value = MagicMock()
+        mock_llm_cls.return_value = MagicMock()
         mock_cleanup.return_value = MagicMock()
 
         create_mcore_engine(path=Path("/fake"), model_format="nemo", expert_tensor_parallel_size=4)
@@ -452,19 +437,14 @@ def test_explicit_etp_passed_through(
         assert kwargs["expert_tensor_parallel_size"] == 4
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    def test_explicit_sp_passed_through(
-        self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup
-    ):
+    def test_explicit_sp_passed_through(self, mock_cleanup, mock_llm_cls, mock_setup):
         """An explicit sequence_parallel=True value is forwarded unchanged."""
         from nemo_deploy.llm.inference.inference_base import create_mcore_engine
 
         mock_setup.return_value = ([MagicMock()], MagicMock())
-        mock_engine_cls.return_value = MagicMock()
+        mock_llm_cls.return_value = MagicMock()
         mock_cleanup.return_value = MagicMock()
 
         create_mcore_engine(path=Path("/fake"), model_format="nemo", sequence_parallel=True)
@@ -487,7 +467,7 @@ def test_expert_tensor_parallel_size_forwarded(self, mock_create):
         """expert_tensor_parallel_size is forwarded to create_mcore_engine."""
         from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable
 
-        mock_create.return_value = (MagicMock(), MagicMock(), MagicMock())
+        mock_create.return_value = (MagicMock(), MagicMock())
 
         MegatronLLMDeployable(
             megatron_checkpoint_filepath="model.ckpt",
@@ -503,7 +483,7 @@ def test_sequence_parallel_forwarded(self, mock_create):
         """sequence_parallel is forwarded to create_mcore_engine."""
         from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable
 
-        mock_create.return_value = (MagicMock(), MagicMock(), MagicMock())
+        mock_create.return_value = (MagicMock(), MagicMock())
 
         MegatronLLMDeployable(
             megatron_checkpoint_filepath="model.ckpt",
@@ -519,7 +499,7 @@ def test_defaults_etp_1_and_sp_false(self, mock_create):
         """Defaults: expert_tensor_parallel_size=1, sequence_parallel=False."""
         from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable
 
-        mock_create.return_value = (MagicMock(), MagicMock(), MagicMock())
+        mock_create.return_value = (MagicMock(), MagicMock())
 
         MegatronLLMDeployable(megatron_checkpoint_filepath="model.ckpt")
 
diff --git a/tests/unit_tests/deploy/test_inference_base.py b/tests/unit_tests/deploy/test_inference_base.py
index d80c7dd2c..c35f4edea 100644
--- a/tests/unit_tests/deploy/test_inference_base.py
+++ b/tests/unit_tests/deploy/test_inference_base.py
@@ -19,10 +19,7 @@
 
 import pytest
 import torch
-from megatron.core.inference.engines.mcore_engine import MCoreEngine
-from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
-    GPTInferenceWrapper,
-)
+from megatron.core.inference.apis import MegatronLLM
 from megatron.core.transformer.module import MegatronModule
 
 from nemo_deploy.llm.inference.inference_base import (
@@ -322,33 +319,31 @@ def test_setup_model_and_tokenizer_not_dist_ckpt(
         mock_check_dist.assert_called_once()
 
     def test_mcore_engine_with_cleanup(self):
-        # Create mocks for the engine and wrapper
-        mock_engine = MagicMock(spec=MCoreEngine)
-        mock_wrapper = MagicMock(spec=GPTInferenceWrapper)
+        # Create mock for the LLM engine
+        mock_llm = MagicMock(spec=MegatronLLM)
 
         # Create the wrapper
-        engine_wrapper = MCoreEngineWithCleanup(mock_engine, mock_wrapper, self.mock_tokenizer)
+        engine_wrapper = MCoreEngineWithCleanup(mock_llm, self.mock_tokenizer)
 
         # Test attribute delegation - mock the attribute access directly instead of using __getattr__
         # Define the attribute directly on the mock
-        mock_engine.some_attribute = "attribute_value"
+        mock_llm.some_attribute = "attribute_value"
         attribute_value = engine_wrapper.some_attribute
         self.assertEqual(attribute_value, "attribute_value")
 
         # Test method delegation - create a method on the mock
-        mock_engine.some_method = MagicMock(return_value="method_result")
+        mock_llm.some_method = MagicMock(return_value="method_result")
         result = engine_wrapper.some_method()
         self.assertEqual(result, "method_result")
-        mock_engine.some_method.assert_called_once()
+        mock_llm.some_method.assert_called_once()
 
     @patch("nemo_deploy.llm.inference.inference_base.cleanup_distributed")
     def test_mcore_engine_with_cleanup_del(self, mock_cleanup):
-        # Create mocks
-        mock_engine = MagicMock(spec=MCoreEngine)
-        mock_wrapper = MagicMock(spec=GPTInferenceWrapper)
+        # Create mock for the LLM engine
+        mock_llm = MagicMock(spec=MegatronLLM)
 
         # Create the wrapper
-        engine_wrapper = MCoreEngineWithCleanup(mock_engine, mock_wrapper, self.mock_tokenizer)
+        engine_wrapper = MCoreEngineWithCleanup(mock_llm, self.mock_tokenizer)
 
         # Call __del__
         engine_wrapper.__del__()
@@ -781,26 +776,20 @@ def test_setup_model_and_tokenizer_model_config_kwargs(
         self.assertEqual(self.model_config.hidden_size, 1024)
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     def test_create_mcore_engine_nemo_format(
         self,
-        mock_mcore_engine,
-        mock_text_ctrl,
-        mock_gpt_wrapper,
-        mock_static_ctx,
+        mock_megatron_llm,
         mock_setup,
     ):
         """Test create_mcore_engine with nemo model_format."""
         mock_model = MagicMock()
         mock_tokenizer = MagicMock()
         mock_setup.return_value = ([mock_model], mock_tokenizer)
-        mock_engine_instance = MagicMock()
-        mock_mcore_engine.return_value = mock_engine_instance
+        mock_llm_instance = MagicMock()
+        mock_megatron_llm.return_value = mock_llm_instance
 
-        engine, wrapper, tokenizer = create_mcore_engine(
+        engine, tokenizer = create_mcore_engine(
             path=self.mock_path,
             model_format="nemo",
             inference_max_seq_length=2048,
@@ -808,20 +797,14 @@ def test_create_mcore_engine_nemo_format(
         )
 
         mock_setup.assert_called_once()
-        mock_mcore_engine.assert_called_once()
+        mock_megatron_llm.assert_called_once()
         self.assertIsNotNone(engine)
 
     @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
+    @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM")
     def test_create_mcore_engine_megatron_format(
         self,
-        mock_mcore_engine,
-        mock_text_ctrl,
-        mock_gpt_wrapper,
-        mock_static_ctx,
+        mock_megatron_llm,
         mock_setup,
     ):
         """Test create_mcore_engine with megatron model_format."""
@@ -829,10 +812,10 @@ def test_create_mcore_engine_megatron_format(
         mock_tokenizer = MagicMock()
         mock_mlm_args = MagicMock()
         mock_setup.return_value = ([mock_model], mock_tokenizer, mock_mlm_args)
-        mock_engine_instance = MagicMock()
-        mock_mcore_engine.return_value = mock_engine_instance
+        mock_llm_instance = MagicMock()
+        mock_megatron_llm.return_value = mock_llm_instance
 
-        engine, wrapper, tokenizer = create_mcore_engine(
+        engine, tokenizer = create_mcore_engine(
             path=self.mock_path,
             model_format="megatron",
             inference_max_seq_length=2048,
@@ -840,7 +823,7 @@ def test_create_mcore_engine_megatron_format(
         )
 
         mock_setup.assert_called_once()
-        mock_mcore_engine.assert_called_once()
+        mock_megatron_llm.assert_called_once()
         self.assertIsNotNone(engine)
 
     @patch("nemo_deploy.llm.inference.inference_base.torch_distributed_init")
diff --git a/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py
index 666e4779c..81a6c760e 100644
--- a/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py
+++ b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py
@@ -18,7 +18,7 @@
 import numpy as np
 import pytest
 import torch
-from megatron.core.inference.common_inference_params import CommonInferenceParams
+from megatron.core.inference.sampling_params import SamplingParams
 from PIL import Image
 
 from nemo_deploy.multimodal.megatron_multimodal_deployable import MegatronMultimodalDeployable
@@ -157,7 +157,7 @@ def test_generate_method(self, deployable, sample_image):
         """Test the generate method."""
         prompts = ["Test prompt 1", "Test prompt 2"]
         images = [sample_image, sample_image]
-        inference_params = CommonInferenceParams(temperature=0.7, top_k=10, top_p=0.9, num_tokens_to_generate=100)
+        inference_params = SamplingParams(temperature=0.7, top_k=10, top_p=0.9, num_tokens_to_generate=100)
 
         with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate:
             with patch.object(deployable, "apply_chat_template", side_effect=lambda x: x):
@@ -282,8 +282,8 @@ def test_infer_fn(self, deployable, sample_image_base64, sample_image):
                 assert call_args[0][0] == prompts
                 # Images should be converted from base64
                 assert len(call_args[0][1]) == 2
-                # Check that inference_params is a CommonInferenceParams object (3rd positional arg)
-                assert isinstance(call_args[0][2], CommonInferenceParams)
+                # Check that inference_params is a SamplingParams object (3rd positional arg)
+                assert isinstance(call_args[0][2], SamplingParams)
                 assert call_args[0][2].temperature == 0.8
                 assert call_args[0][2].top_k == 20
                 assert call_args[0][2].top_p == 0.95
@@ -318,8 +318,8 @@ def test_infer_fn_default_params(self, deployable, sample_image_base64, sample_i
                 assert call_args[0][0] == prompts
                 # Images should be converted from base64
                 assert len(call_args[0][1]) == 1
-                # Check that inference_params is a CommonInferenceParams object (3rd positional arg)
-                assert isinstance(call_args[0][2], CommonInferenceParams)
+                # Check that inference_params is a SamplingParams object (3rd positional arg)
+                assert isinstance(call_args[0][2], SamplingParams)
                 assert call_args[0][2].temperature == 1.0
                 assert call_args[0][2].top_k == 1
                 assert call_args[0][2].top_p == 0.0
@@ -357,7 +357,7 @@ def test_infer_fn_with_temperature_zero(self, deployable):
                 call_args = mock_generate.call_args
 
                 # Check that inference_params has greedy sampling parameters
-                assert isinstance(call_args[0][2], CommonInferenceParams)
+                assert isinstance(call_args[0][2], SamplingParams)
                 assert call_args[0][2].temperature == 0.0  # Kept as 0.0
                 assert call_args[0][2].top_k == 1  # Overridden for greedy sampling
                 assert call_args[0][2].top_p == 0.0  # Overridden for greedy sampling
diff --git a/tests/unit_tests/deploy/test_megatronllm_deployable.py b/tests/unit_tests/deploy/test_megatronllm_deployable.py
index af6902770..0e1ef4d4f 100644
--- a/tests/unit_tests/deploy/test_megatronllm_deployable.py
+++ b/tests/unit_tests/deploy/test_megatronllm_deployable.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 import pytest
-from megatron.core.inference.common_inference_params import CommonInferenceParams
+from megatron.core.inference.sampling_params import SamplingParams
 
 from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable, dict_to_str
 from nemo_export_deploy_common.import_utils import UnavailableError
@@ -121,7 +121,7 @@ def test_generate_with_cuda_graphs_empty_prompts(deployable):
     deployable.enable_cuda_graphs = True
     deployable.max_batch_size = 4
     prompts = []
-    inference_params = CommonInferenceParams()
+    inference_params = SamplingParams()
 
     with patch.object(deployable.mcore_engine, "generate") as mock_generate:
         mock_generate.return_value = ["", "", "", ""]
@@ -197,13 +197,13 @@ def test_generate_other_ranks_disables_materialize_when_log_probs(deployable):
             [1.0, 1, 0.0, 256, True, None],  # log_probs=True
         ]
 
-        deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True
-        deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True
 
         deployable.generate_other_ranks()
 
-        assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False
-        assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False
 
 
 @pytest.mark.run_only_on("GPU")
@@ -387,7 +387,7 @@ def test_generate_without_cuda_graphs(deployable):
     deployable.enable_cuda_graphs = False
 
     prompts = ["Hello", "World"]
-    inference_params = CommonInferenceParams(
+    inference_params = SamplingParams(
         temperature=1.0,
         top_k=1,
         top_p=0.0,
@@ -403,7 +403,7 @@ def test_generate_without_cuda_graphs(deployable):
 
         results = deployable.generate(prompts, inference_params)
         assert len(results) == 2
-        mock_generate.assert_called_once_with(prompts=prompts, add_BOS=False, common_inference_params=inference_params)
+        mock_generate.assert_called_once_with(prompts=prompts, sampling_params=inference_params)
 
 
 @pytest.mark.run_only_on("GPU")
@@ -414,7 +414,7 @@ def test_generate_with_cuda_graphs(deployable):
     deployable.max_batch_size = 4
 
     prompts = ["Hello", "World"]
-    inference_params = CommonInferenceParams(
+    inference_params = SamplingParams(
         temperature=1.0,
         top_k=1,
         top_p=0.0,
@@ -446,8 +446,7 @@ def test_generate_with_cuda_graphs(deployable):
         called_args = mock_generate.call_args[1]
         assert len(called_args["prompts"]) == 4  # Should pad to max_batch_size
         assert called_args["prompts"][:2] == prompts  # Original prompts should be first
-        assert called_args["add_BOS"] is False
-        assert called_args["common_inference_params"] == inference_params
+        assert called_args["sampling_params"] == inference_params
 
 
 @pytest.mark.run_only_on("GPU")
@@ -607,8 +606,8 @@ def test_init_with_megatron_valid_types(model_type):
         patch("nemo_deploy.llm.megatronllm_deployable.HAVE_TRITON", True),
         patch("nemo_deploy.llm.megatronllm_deployable.create_mcore_engine") as mock_create,
     ):
-        mock_engine, mock_model, mock_tokenizer = MagicMock(), MagicMock(), MagicMock()
-        mock_create.return_value = (mock_engine, mock_model, mock_tokenizer)
+        mock_engine, mock_tokenizer = MagicMock(), MagicMock()
+        mock_create.return_value = (mock_engine, mock_tokenizer)
 
         deployable = MegatronLLMDeployable(
             megatron_checkpoint_filepath="bar.ckpt",
@@ -716,7 +715,7 @@ def test_infer_fn_basic(deployable):
         call_args = mock_generate.call_args[0]
         assert call_args[0] == prompts
 
-        # Verify CommonInferenceParams
+        # Verify SamplingParams
         inference_params = mock_generate.call_args[0][1]
         assert inference_params.temperature == 1.0
         assert inference_params.top_k == 1
@@ -1076,13 +1075,13 @@ def test_infer_fn_disables_materialize_only_last_token_logits_when_log_probs(dep
         mock_tensor_instance.cpu.return_value.detach.return_value.numpy.return_value = np.array([0.1])
         mock_tensor.return_value = mock_tensor_instance
 
-        deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True
-        deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True
 
         deployable._infer_fn(prompts=["Hello"], log_probs=True)
 
-        assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False
-        assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False
 
 
 @pytest.mark.run_only_on("GPU")
@@ -1100,13 +1099,13 @@ def test_infer_fn_disables_materialize_only_last_token_logits_when_top_logprobs(
         mock_remove_eos.return_value = ["text"]
         mock_dict_to_str.return_value = '{"tok": 0.1}'
 
-        deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True
-        deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True
 
         deployable._infer_fn(prompts=["Hello"], top_logprobs=5)
 
-        assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False
-        assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False
+        assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False
 
 
 @pytest.mark.run_only_on("GPU")
@@ -1121,10 +1120,10 @@ def test_infer_fn_keeps_materialize_only_last_token_logits_when_no_logprobs(depl
         mock_generate.return_value = [mock_result]
         mock_remove_eos.return_value = ["text"]
 
-        deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True
-        deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.materialize_only_last_token_logits = True
+        deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True
 
         deployable._infer_fn(prompts=["Hello"], log_probs=False, top_logprobs=0)
 
-        assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is True
-        assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is True
+        assert deployable.mcore_engine.engine.materialize_only_last_token_logits is True
+        assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is True