From 7143ebf93b9c865f86bd5faf227de9ea55a937b5 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Thu, 11 Jun 2026 15:09:44 -0400 Subject: [PATCH] Addressing new inference updates in mcore Signed-off-by: Onur Yilmaz --- nemo_deploy/llm/inference/inference_base.py | 74 ++++++++----------- nemo_deploy/llm/megatronllm_deployable.py | 34 ++++----- .../functional_tests/utils/run_nemo_export.py | 6 +- .../deploy/test_etp_sequence_parallel.py | 50 ++++--------- .../unit_tests/deploy/test_inference_base.py | 61 ++++++--------- .../test_megatron_multimodal_deployable.py | 14 ++-- .../deploy/test_megatronllm_deployable.py | 51 +++++++------ 7 files changed, 119 insertions(+), 171 deletions(-) diff --git a/nemo_deploy/llm/inference/inference_base.py b/nemo_deploy/llm/inference/inference_base.py index 9c8681441..b2f16d4fd 100644 --- a/nemo_deploy/llm/inference/inference_base.py +++ b/nemo_deploy/llm/inference/inference_base.py @@ -27,14 +27,8 @@ get_default_load_sharded_strategy, ) from megatron.core.dist_checkpointing.validation import StrictHandling -from megatron.core.inference.contexts.static_context import StaticInferenceContext -from megatron.core.inference.engines.mcore_engine import MCoreEngine -from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( - GPTInferenceWrapper, -) -from megatron.core.inference.text_generation_controllers.text_generation_controller import ( - TextGenerationController, -) +from megatron.core.inference.apis import MegatronLLM +from megatron.core.inference.config import InferenceConfig from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import MLATransformerConfig @@ -390,27 +384,24 @@ def setup_model_and_tokenizer_for_inference( class MCoreEngineWithCleanup: - """Wrapper around MCoreEngine that ensures proper cleanup of distributed resources. + """Wrapper around MegatronLLM that ensures proper cleanup of distributed resources. - This class delegates all operations to the underlying MCoreEngine while ensuring that - distributed resources are properly cleaned up when the engine is destroyed. + This class delegates all operations to the underlying MegatronLLM engine while ensuring + that distributed resources are properly cleaned up when the engine is destroyed. """ def __init__( self, - mcore_engine: MCoreEngine, - model_inference_wrapper: GPTInferenceWrapper, + llm: MegatronLLM, tokenizer: Union[MCoreTokenizerWrappper, MegatronTokenizer], ): """Initialize the MCoreEngineWithCleanup. Args: - mcore_engine (MCoreEngine): The underlying MCoreEngine instance - model_inference_wrapper (GPTInferenceWrapper): The model inference wrapper + llm (MegatronLLM): The underlying MegatronLLM instance tokenizer (Union[MCoreTokenizerWrappper, MegatronTokenizer]): The tokenizer instance """ - self.mcore_engine = mcore_engine - self.model_inference_wrapper = model_inference_wrapper + self.mcore_engine = llm self.tokenizer = tokenizer def __del__(self): @@ -446,8 +437,8 @@ def create_mcore_engine( buffer_size_gb: float = 10.0, legacy_model_format: bool = False, **model_config_kwargs, -) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: - """Set up the model, tokenizer and MCoreEngine for inference. +) -> Tuple[MCoreEngineWithCleanup, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: + """Set up the model, tokenizer and MegatronLLM engine for inference. Args: path (Path): Path to the checkpoint file @@ -455,7 +446,7 @@ def create_mcore_engine( inference_batch_times_seqlen_threshold (int): Threshold for batch size times sequence length inference_max_seq_length (int): Maximum sequence length for inference max_batch_size (int): Maximum batch size for inference - random_seed (Optional[int]): Random seed for reproducibility + random_seed (Optional[int]): Random seed for reproducibility (set globally during init) tensor_model_parallel_size (Optional[int]): Size of tensor model parallelism pipeline_model_parallel_size (Optional[int]): Size of pipeline model parallelism context_parallel_size (Optional[int]): Size of context parallelism @@ -466,11 +457,10 @@ def create_mcore_engine( model_type (str): Type of model to load (default: "gpt") model_format (str): Format of model to load (default: "nemo") micro_batch_size (Optional[int]): Micro batch size for model execution - legacy_model_format (bool): Whether to use the legacy StaticInferenceEngine path in MCoreEngine (default: False) + legacy_model_format (bool): Deprecated; no longer used (DynamicInferenceEngine is always used) Returns: - Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: Tuple containing: + Tuple[MCoreEngineWithCleanup, Union[MCoreTokenizerWrappper, MegatronTokenizer]]: Tuple containing: - MCoreEngineWithCleanup: Engine for text generation with proper cleanup - - GPTInferenceWrapper: Inference-wrapped model - Union[MCoreTokenizerWrappper, MegatronTokenizer]: Tokenizer instance """ # Default to 1 for any parallelism dimension that's None @@ -512,34 +502,32 @@ def create_mcore_engine( else: raise ValueError(f"Model format {model_format} not supported.") - # MLA models require block_size_tokens=64 for the dynamic engine, which is not - # configurable in the current Megatron-LM version. Fall back to the legacy static - # engine so MLA inference works correctly without touching Megatron-LM. + model.eval() + + # MLA models require block_size_tokens=64 for correct KV cache operation with the + # dynamic inference engine. Set the attention backend to flash if not already set. + block_size_tokens = 256 model_config = getattr(model, "config", None) if isinstance(model_config, MLATransformerConfig): - legacy_model_format = True - # The legacy static engine requires an explicit attention backend. - # MLA models use flash attention (attention_mask is handled internally). + block_size_tokens = 64 if not model_config.attention_backend: model_config.attention_backend = AttnBackend.flash - inference_context = StaticInferenceContext( - max_batch_size=max_batch_size, + inference_config = InferenceConfig( max_sequence_length=inference_max_seq_length, + buffer_size_gb=int(buffer_size_gb), + max_requests=max_batch_size, + block_size_tokens=block_size_tokens, + materialize_only_last_token_logits=True, ) - model_inference_wrapper = GPTInferenceWrapper(model, inference_context) - text_generation_controller = TextGenerationController( - inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer - ) - mcore_engine = MCoreEngine( - text_generation_controller=text_generation_controller, - max_batch_size=max_batch_size, - random_seed=random_seed, - buffer_size_gb=buffer_size_gb, - legacy=legacy_model_format, + + llm = MegatronLLM( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, ) # Wrap the engine to ensure cleanup - wrapped_engine = MCoreEngineWithCleanup(mcore_engine, model_inference_wrapper, tokenizer) + wrapped_engine = MCoreEngineWithCleanup(llm, tokenizer) - return wrapped_engine, model_inference_wrapper, tokenizer + return wrapped_engine, tokenizer diff --git a/nemo_deploy/llm/megatronllm_deployable.py b/nemo_deploy/llm/megatronllm_deployable.py index ccbeddb12..10910ac90 100755 --- a/nemo_deploy/llm/megatronllm_deployable.py +++ b/nemo_deploy/llm/megatronllm_deployable.py @@ -21,8 +21,8 @@ import torch import torch.distributed from jinja2 import Template -from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.sampling_params import SamplingParams from nemo_deploy import ITritonDeployable from nemo_deploy.llm.inference.inference_base import create_mcore_engine @@ -113,7 +113,7 @@ def __init__( if model_type not in ["gpt", "mamba"]: raise ValueError(f"Model type {model_type} not supported for Megatron models.") - self.mcore_engine, self.inference_wrapped_model, self.mcore_tokenizer = create_mcore_engine( + self.mcore_engine, self.mcore_tokenizer = create_mcore_engine( num_devices=num_devices, num_nodes=num_nodes, path=Path(megatron_checkpoint_filepath), @@ -144,18 +144,18 @@ def __init__( def generate( self, prompts: List[str], - inference_params: Optional[CommonInferenceParams] = None, - ) -> List[InferenceRequest]: + inference_params: Optional[SamplingParams] = None, + ) -> List[DynamicInferenceRequest]: """Generates text based on the provided input prompts. Args: prompts (List[str]): A list of input strings. - inference_params (Optional[CommonInferenceParams]): Parameters for controlling the inference process. + inference_params (Optional[SamplingParams]): Parameters for controlling the inference process. Returns: - List[InferenceRequest]: A list containing the generated results. + List[DynamicInferenceRequest]: A list containing the generated results. """ - inference_params = inference_params or CommonInferenceParams() + inference_params = inference_params or SamplingParams() # Store the original number of prompts orig_num_prompts = len(prompts) @@ -173,8 +173,7 @@ def generate( results = self.mcore_engine.generate( prompts=padded_prompts, - add_BOS=False, - common_inference_params=inference_params, + sampling_params=inference_params, ) # Only return results for the original prompts @@ -182,8 +181,7 @@ def generate( else: results = self.mcore_engine.generate( prompts=prompts, - add_BOS=False, - common_inference_params=inference_params, + sampling_params=inference_params, ) return list(results) @@ -198,7 +196,7 @@ def generate_other_ranks(self): data=[None], src=0 ) - inference_params = CommonInferenceParams( + inference_params = SamplingParams( temperature=temperature, top_k=int(top_k), top_p=float(top_p), @@ -208,7 +206,7 @@ def generate_other_ranks(self): ) if log_probs: - dynamic_engine = getattr(self.mcore_engine, "dynamic_engine", None) + dynamic_engine = getattr(self.mcore_engine, "engine", None) if dynamic_engine is not None: dynamic_engine.materialize_only_last_token_logits = False dynamic_engine.context.config.materialize_only_last_token_logits = False @@ -419,15 +417,15 @@ def _infer_fn( ) # cast top_k,top_p to native int, float since typecheck assert statements added in MCore0.13 error otherwise - # return_prompt_top_n_logprobs returns top_logprobs for prompt tokens too when top_logprobs>0. - inference_params = CommonInferenceParams( + # skip_prompt_log_probs=False (default) includes prompt tokens in top-N logprobs when top_logprobs>0. + inference_params = SamplingParams( temperature=temperature, top_k=int(top_k), top_p=float(top_p), num_tokens_to_generate=num_tokens_to_generate, return_log_probs=log_probs, top_n_logprobs=top_logprobs, - return_prompt_top_n_logprobs=bool(top_logprobs), + skip_prompt_log_probs=not bool(top_logprobs), stop_words=stop_words, ) @@ -436,7 +434,7 @@ def _infer_fn( # (prompt log probs are required for logprob eval benchmarks). # Toggle it on both the engine and the context config (controls the # model forward pass and log prob calculations). - dynamic_engine = getattr(self.mcore_engine, "dynamic_engine", None) + dynamic_engine = getattr(self.mcore_engine, "engine", None) needs_all_logits = log_probs or bool(top_logprobs) if dynamic_engine is not None and needs_all_logits: dynamic_engine.materialize_only_last_token_logits = False diff --git a/tests/functional_tests/utils/run_nemo_export.py b/tests/functional_tests/utils/run_nemo_export.py index ce6938299..521be9255 100644 --- a/tests/functional_tests/utils/run_nemo_export.py +++ b/tests/functional_tests/utils/run_nemo_export.py @@ -35,13 +35,13 @@ in_framework_supported = True try: - from megatron.core.inference.common_inference_params import CommonInferenceParams + from megatron.core.inference.sampling_params import SamplingParams from nemo_deploy.llm import NemoQueryLLMPyTorch from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable except Exception as e: LOGGER.warning( - "Cannot import MegatronLLMDeployable class, or NemoQueryLLMPyTorch, or CommonInferenceParams, " + "Cannot import MegatronLLMDeployable class, or NemoQueryLLMPyTorch, or SamplingParams, " f"in-framework inference will not be available. Reason: {type(e).__name__}: {e}" ) in_framework_supported = False @@ -98,7 +98,7 @@ def get_accuracy_with_lambada(model, nq, lora_uids, test_data_path, use_vllm: bo if in_framework_supported and isinstance(model, MegatronLLMDeployable): model_output = model.generate( prompts=[prompt], - inference_params=CommonInferenceParams( + inference_params=SamplingParams( temperature=0.1, top_k=1, top_p=0.0, diff --git a/tests/unit_tests/deploy/test_etp_sequence_parallel.py b/tests/unit_tests/deploy/test_etp_sequence_parallel.py index 732f2d067..6cdc1069f 100644 --- a/tests/unit_tests/deploy/test_etp_sequence_parallel.py +++ b/tests/unit_tests/deploy/test_etp_sequence_parallel.py @@ -389,19 +389,14 @@ class TestCreateMcoreEngineETPSequenceParallel(unittest.TestCase): """Tests that create_mcore_engine handles ETP/SP defaults and passes them down.""" @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - def test_etp_defaults_to_1_when_none( - self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup - ): + def test_etp_defaults_to_1_when_none(self, mock_cleanup, mock_llm_cls, mock_setup): """expert_tensor_parallel_size=None is normalised to 1 before forwarding.""" from nemo_deploy.llm.inference.inference_base import create_mcore_engine mock_setup.return_value = ([MagicMock()], MagicMock()) - mock_engine_cls.return_value = MagicMock() + mock_llm_cls.return_value = MagicMock() mock_cleanup.return_value = MagicMock() create_mcore_engine(path=Path("/fake"), model_format="nemo", expert_tensor_parallel_size=None) @@ -410,19 +405,14 @@ def test_etp_defaults_to_1_when_none( assert kwargs["expert_tensor_parallel_size"] == 1 @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - def test_sp_defaults_to_1_when_none( - self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup - ): + def test_sp_defaults_to_1_when_none(self, mock_cleanup, mock_llm_cls, mock_setup): """sequence_parallel=None is normalised to 1 before forwarding.""" from nemo_deploy.llm.inference.inference_base import create_mcore_engine mock_setup.return_value = ([MagicMock()], MagicMock()) - mock_engine_cls.return_value = MagicMock() + mock_llm_cls.return_value = MagicMock() mock_cleanup.return_value = MagicMock() create_mcore_engine(path=Path("/fake"), model_format="nemo", sequence_parallel=None) @@ -431,19 +421,14 @@ def test_sp_defaults_to_1_when_none( assert kwargs["sequence_parallel"] == 1 @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - def test_explicit_etp_passed_through( - self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup - ): + def test_explicit_etp_passed_through(self, mock_cleanup, mock_llm_cls, mock_setup): """An explicit expert_tensor_parallel_size value is forwarded unchanged.""" from nemo_deploy.llm.inference.inference_base import create_mcore_engine mock_setup.return_value = ([MagicMock()], MagicMock()) - mock_engine_cls.return_value = MagicMock() + mock_llm_cls.return_value = MagicMock() mock_cleanup.return_value = MagicMock() create_mcore_engine(path=Path("/fake"), model_format="nemo", expert_tensor_parallel_size=4) @@ -452,19 +437,14 @@ def test_explicit_etp_passed_through( assert kwargs["expert_tensor_parallel_size"] == 4 @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") @patch("nemo_deploy.llm.inference.inference_base.MCoreEngineWithCleanup") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - def test_explicit_sp_passed_through( - self, mock_tgc, mock_ctx, mock_wrapper, mock_cleanup, mock_engine_cls, mock_setup - ): + def test_explicit_sp_passed_through(self, mock_cleanup, mock_llm_cls, mock_setup): """An explicit sequence_parallel=True value is forwarded unchanged.""" from nemo_deploy.llm.inference.inference_base import create_mcore_engine mock_setup.return_value = ([MagicMock()], MagicMock()) - mock_engine_cls.return_value = MagicMock() + mock_llm_cls.return_value = MagicMock() mock_cleanup.return_value = MagicMock() create_mcore_engine(path=Path("/fake"), model_format="nemo", sequence_parallel=True) @@ -487,7 +467,7 @@ def test_expert_tensor_parallel_size_forwarded(self, mock_create): """expert_tensor_parallel_size is forwarded to create_mcore_engine.""" from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable - mock_create.return_value = (MagicMock(), MagicMock(), MagicMock()) + mock_create.return_value = (MagicMock(), MagicMock()) MegatronLLMDeployable( megatron_checkpoint_filepath="model.ckpt", @@ -503,7 +483,7 @@ def test_sequence_parallel_forwarded(self, mock_create): """sequence_parallel is forwarded to create_mcore_engine.""" from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable - mock_create.return_value = (MagicMock(), MagicMock(), MagicMock()) + mock_create.return_value = (MagicMock(), MagicMock()) MegatronLLMDeployable( megatron_checkpoint_filepath="model.ckpt", @@ -519,7 +499,7 @@ def test_defaults_etp_1_and_sp_false(self, mock_create): """Defaults: expert_tensor_parallel_size=1, sequence_parallel=False.""" from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable - mock_create.return_value = (MagicMock(), MagicMock(), MagicMock()) + mock_create.return_value = (MagicMock(), MagicMock()) MegatronLLMDeployable(megatron_checkpoint_filepath="model.ckpt") diff --git a/tests/unit_tests/deploy/test_inference_base.py b/tests/unit_tests/deploy/test_inference_base.py index d80c7dd2c..c35f4edea 100644 --- a/tests/unit_tests/deploy/test_inference_base.py +++ b/tests/unit_tests/deploy/test_inference_base.py @@ -19,10 +19,7 @@ import pytest import torch -from megatron.core.inference.engines.mcore_engine import MCoreEngine -from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( - GPTInferenceWrapper, -) +from megatron.core.inference.apis import MegatronLLM from megatron.core.transformer.module import MegatronModule from nemo_deploy.llm.inference.inference_base import ( @@ -322,33 +319,31 @@ def test_setup_model_and_tokenizer_not_dist_ckpt( mock_check_dist.assert_called_once() def test_mcore_engine_with_cleanup(self): - # Create mocks for the engine and wrapper - mock_engine = MagicMock(spec=MCoreEngine) - mock_wrapper = MagicMock(spec=GPTInferenceWrapper) + # Create mock for the LLM engine + mock_llm = MagicMock(spec=MegatronLLM) # Create the wrapper - engine_wrapper = MCoreEngineWithCleanup(mock_engine, mock_wrapper, self.mock_tokenizer) + engine_wrapper = MCoreEngineWithCleanup(mock_llm, self.mock_tokenizer) # Test attribute delegation - mock the attribute access directly instead of using __getattr__ # Define the attribute directly on the mock - mock_engine.some_attribute = "attribute_value" + mock_llm.some_attribute = "attribute_value" attribute_value = engine_wrapper.some_attribute self.assertEqual(attribute_value, "attribute_value") # Test method delegation - create a method on the mock - mock_engine.some_method = MagicMock(return_value="method_result") + mock_llm.some_method = MagicMock(return_value="method_result") result = engine_wrapper.some_method() self.assertEqual(result, "method_result") - mock_engine.some_method.assert_called_once() + mock_llm.some_method.assert_called_once() @patch("nemo_deploy.llm.inference.inference_base.cleanup_distributed") def test_mcore_engine_with_cleanup_del(self, mock_cleanup): - # Create mocks - mock_engine = MagicMock(spec=MCoreEngine) - mock_wrapper = MagicMock(spec=GPTInferenceWrapper) + # Create mock for the LLM engine + mock_llm = MagicMock(spec=MegatronLLM) # Create the wrapper - engine_wrapper = MCoreEngineWithCleanup(mock_engine, mock_wrapper, self.mock_tokenizer) + engine_wrapper = MCoreEngineWithCleanup(mock_llm, self.mock_tokenizer) # Call __del__ engine_wrapper.__del__() @@ -781,26 +776,20 @@ def test_setup_model_and_tokenizer_model_config_kwargs( self.assertEqual(self.model_config.hidden_size, 1024) @patch("nemo_deploy.llm.inference.inference_base.setup_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") def test_create_mcore_engine_nemo_format( self, - mock_mcore_engine, - mock_text_ctrl, - mock_gpt_wrapper, - mock_static_ctx, + mock_megatron_llm, mock_setup, ): """Test create_mcore_engine with nemo model_format.""" mock_model = MagicMock() mock_tokenizer = MagicMock() mock_setup.return_value = ([mock_model], mock_tokenizer) - mock_engine_instance = MagicMock() - mock_mcore_engine.return_value = mock_engine_instance + mock_llm_instance = MagicMock() + mock_megatron_llm.return_value = mock_llm_instance - engine, wrapper, tokenizer = create_mcore_engine( + engine, tokenizer = create_mcore_engine( path=self.mock_path, model_format="nemo", inference_max_seq_length=2048, @@ -808,20 +797,14 @@ def test_create_mcore_engine_nemo_format( ) mock_setup.assert_called_once() - mock_mcore_engine.assert_called_once() + mock_megatron_llm.assert_called_once() self.assertIsNotNone(engine) @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference") - @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext") - @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper") - @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController") - @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine") + @patch("nemo_deploy.llm.inference.inference_base.MegatronLLM") def test_create_mcore_engine_megatron_format( self, - mock_mcore_engine, - mock_text_ctrl, - mock_gpt_wrapper, - mock_static_ctx, + mock_megatron_llm, mock_setup, ): """Test create_mcore_engine with megatron model_format.""" @@ -829,10 +812,10 @@ def test_create_mcore_engine_megatron_format( mock_tokenizer = MagicMock() mock_mlm_args = MagicMock() mock_setup.return_value = ([mock_model], mock_tokenizer, mock_mlm_args) - mock_engine_instance = MagicMock() - mock_mcore_engine.return_value = mock_engine_instance + mock_llm_instance = MagicMock() + mock_megatron_llm.return_value = mock_llm_instance - engine, wrapper, tokenizer = create_mcore_engine( + engine, tokenizer = create_mcore_engine( path=self.mock_path, model_format="megatron", inference_max_seq_length=2048, @@ -840,7 +823,7 @@ def test_create_mcore_engine_megatron_format( ) mock_setup.assert_called_once() - mock_mcore_engine.assert_called_once() + mock_megatron_llm.assert_called_once() self.assertIsNotNone(engine) @patch("nemo_deploy.llm.inference.inference_base.torch_distributed_init") diff --git a/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py index 666e4779c..81a6c760e 100644 --- a/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py +++ b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py @@ -18,7 +18,7 @@ import numpy as np import pytest import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from PIL import Image from nemo_deploy.multimodal.megatron_multimodal_deployable import MegatronMultimodalDeployable @@ -157,7 +157,7 @@ def test_generate_method(self, deployable, sample_image): """Test the generate method.""" prompts = ["Test prompt 1", "Test prompt 2"] images = [sample_image, sample_image] - inference_params = CommonInferenceParams(temperature=0.7, top_k=10, top_p=0.9, num_tokens_to_generate=100) + inference_params = SamplingParams(temperature=0.7, top_k=10, top_p=0.9, num_tokens_to_generate=100) with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate: with patch.object(deployable, "apply_chat_template", side_effect=lambda x: x): @@ -282,8 +282,8 @@ def test_infer_fn(self, deployable, sample_image_base64, sample_image): assert call_args[0][0] == prompts # Images should be converted from base64 assert len(call_args[0][1]) == 2 - # Check that inference_params is a CommonInferenceParams object (3rd positional arg) - assert isinstance(call_args[0][2], CommonInferenceParams) + # Check that inference_params is a SamplingParams object (3rd positional arg) + assert isinstance(call_args[0][2], SamplingParams) assert call_args[0][2].temperature == 0.8 assert call_args[0][2].top_k == 20 assert call_args[0][2].top_p == 0.95 @@ -318,8 +318,8 @@ def test_infer_fn_default_params(self, deployable, sample_image_base64, sample_i assert call_args[0][0] == prompts # Images should be converted from base64 assert len(call_args[0][1]) == 1 - # Check that inference_params is a CommonInferenceParams object (3rd positional arg) - assert isinstance(call_args[0][2], CommonInferenceParams) + # Check that inference_params is a SamplingParams object (3rd positional arg) + assert isinstance(call_args[0][2], SamplingParams) assert call_args[0][2].temperature == 1.0 assert call_args[0][2].top_k == 1 assert call_args[0][2].top_p == 0.0 @@ -357,7 +357,7 @@ def test_infer_fn_with_temperature_zero(self, deployable): call_args = mock_generate.call_args # Check that inference_params has greedy sampling parameters - assert isinstance(call_args[0][2], CommonInferenceParams) + assert isinstance(call_args[0][2], SamplingParams) assert call_args[0][2].temperature == 0.0 # Kept as 0.0 assert call_args[0][2].top_k == 1 # Overridden for greedy sampling assert call_args[0][2].top_p == 0.0 # Overridden for greedy sampling diff --git a/tests/unit_tests/deploy/test_megatronllm_deployable.py b/tests/unit_tests/deploy/test_megatronllm_deployable.py index af6902770..0e1ef4d4f 100644 --- a/tests/unit_tests/deploy/test_megatronllm_deployable.py +++ b/tests/unit_tests/deploy/test_megatronllm_deployable.py @@ -16,7 +16,7 @@ import numpy as np import pytest -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from nemo_deploy.llm.megatronllm_deployable import MegatronLLMDeployable, dict_to_str from nemo_export_deploy_common.import_utils import UnavailableError @@ -121,7 +121,7 @@ def test_generate_with_cuda_graphs_empty_prompts(deployable): deployable.enable_cuda_graphs = True deployable.max_batch_size = 4 prompts = [] - inference_params = CommonInferenceParams() + inference_params = SamplingParams() with patch.object(deployable.mcore_engine, "generate") as mock_generate: mock_generate.return_value = ["", "", "", ""] @@ -197,13 +197,13 @@ def test_generate_other_ranks_disables_materialize_when_log_probs(deployable): [1.0, 1, 0.0, 256, True, None], # log_probs=True ] - deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True - deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True deployable.generate_other_ranks() - assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False - assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False @pytest.mark.run_only_on("GPU") @@ -387,7 +387,7 @@ def test_generate_without_cuda_graphs(deployable): deployable.enable_cuda_graphs = False prompts = ["Hello", "World"] - inference_params = CommonInferenceParams( + inference_params = SamplingParams( temperature=1.0, top_k=1, top_p=0.0, @@ -403,7 +403,7 @@ def test_generate_without_cuda_graphs(deployable): results = deployable.generate(prompts, inference_params) assert len(results) == 2 - mock_generate.assert_called_once_with(prompts=prompts, add_BOS=False, common_inference_params=inference_params) + mock_generate.assert_called_once_with(prompts=prompts, sampling_params=inference_params) @pytest.mark.run_only_on("GPU") @@ -414,7 +414,7 @@ def test_generate_with_cuda_graphs(deployable): deployable.max_batch_size = 4 prompts = ["Hello", "World"] - inference_params = CommonInferenceParams( + inference_params = SamplingParams( temperature=1.0, top_k=1, top_p=0.0, @@ -446,8 +446,7 @@ def test_generate_with_cuda_graphs(deployable): called_args = mock_generate.call_args[1] assert len(called_args["prompts"]) == 4 # Should pad to max_batch_size assert called_args["prompts"][:2] == prompts # Original prompts should be first - assert called_args["add_BOS"] is False - assert called_args["common_inference_params"] == inference_params + assert called_args["sampling_params"] == inference_params @pytest.mark.run_only_on("GPU") @@ -607,8 +606,8 @@ def test_init_with_megatron_valid_types(model_type): patch("nemo_deploy.llm.megatronllm_deployable.HAVE_TRITON", True), patch("nemo_deploy.llm.megatronllm_deployable.create_mcore_engine") as mock_create, ): - mock_engine, mock_model, mock_tokenizer = MagicMock(), MagicMock(), MagicMock() - mock_create.return_value = (mock_engine, mock_model, mock_tokenizer) + mock_engine, mock_tokenizer = MagicMock(), MagicMock() + mock_create.return_value = (mock_engine, mock_tokenizer) deployable = MegatronLLMDeployable( megatron_checkpoint_filepath="bar.ckpt", @@ -716,7 +715,7 @@ def test_infer_fn_basic(deployable): call_args = mock_generate.call_args[0] assert call_args[0] == prompts - # Verify CommonInferenceParams + # Verify SamplingParams inference_params = mock_generate.call_args[0][1] assert inference_params.temperature == 1.0 assert inference_params.top_k == 1 @@ -1076,13 +1075,13 @@ def test_infer_fn_disables_materialize_only_last_token_logits_when_log_probs(dep mock_tensor_instance.cpu.return_value.detach.return_value.numpy.return_value = np.array([0.1]) mock_tensor.return_value = mock_tensor_instance - deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True - deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True deployable._infer_fn(prompts=["Hello"], log_probs=True) - assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False - assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False @pytest.mark.run_only_on("GPU") @@ -1100,13 +1099,13 @@ def test_infer_fn_disables_materialize_only_last_token_logits_when_top_logprobs( mock_remove_eos.return_value = ["text"] mock_dict_to_str.return_value = '{"tok": 0.1}' - deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True - deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True deployable._infer_fn(prompts=["Hello"], top_logprobs=5) - assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is False - assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.materialize_only_last_token_logits is False + assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is False @pytest.mark.run_only_on("GPU") @@ -1121,10 +1120,10 @@ def test_infer_fn_keeps_materialize_only_last_token_logits_when_no_logprobs(depl mock_generate.return_value = [mock_result] mock_remove_eos.return_value = ["text"] - deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits = True - deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.materialize_only_last_token_logits = True + deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits = True deployable._infer_fn(prompts=["Hello"], log_probs=False, top_logprobs=0) - assert deployable.mcore_engine.dynamic_engine.materialize_only_last_token_logits is True - assert deployable.mcore_engine.dynamic_engine.context.config.materialize_only_last_token_logits is True + assert deployable.mcore_engine.engine.materialize_only_last_token_logits is True + assert deployable.mcore_engine.engine.context.config.materialize_only_last_token_logits is True