-
Notifications
You must be signed in to change notification settings - Fork 5
[Feature] Adding test-deployment-baremetal to the vllm_performance actuator #784
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5d9ddcf
c9858d4
2903085
62a320b
d772ce9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,11 +12,14 @@ | |
| VLLMPerformanceTestParameters, | ||
| ) | ||
| from ado_actuators.vllm_performance.env_manager import ( | ||
| BareMetalEnvironmentManager, | ||
| EnvironmentManager, | ||
| K8SEnvironmentManager, | ||
| ) | ||
| from ado_actuators.vllm_performance.experiment_executor import ( | ||
| run_resource_and_workload_experiment, | ||
| run_workload_experiment, | ||
| run_serve_and_workload_experiment, | ||
| ) | ||
|
|
||
| from orchestrator.core.actuatorconfiguration.config import GenericActuatorParameters | ||
|
|
@@ -104,11 +107,13 @@ def __init__( | |
| # Set parameters | ||
| self.actuator_parameters = params | ||
| # create environment manager actor | ||
| self.env_manager = None | ||
| self.env_manager: EnvironmentManager = None | ||
|
|
||
| logger.debug(f"Actuator initialized with parameters: {params}") | ||
|
|
||
| if self.actuator_parameters.namespace: | ||
| try: | ||
| self.env_manager = EnvironmentManager.remote( | ||
| self.env_manager = K8SEnvironmentManager.remote( | ||
| namespace=params.namespace, | ||
| max_concurrent=params.max_environments, | ||
| in_cluster=params.in_cluster, | ||
|
|
@@ -134,9 +139,34 @@ def __init__( | |
| ) | ||
| else: | ||
| self.log.warning( | ||
| "No namespace set in acutator configuration - will not be able to create deployments" | ||
| "No namespace set in acutator configuration - will not be able to create deployments using k8s" | ||
| ) | ||
|
|
||
| if self.actuator_parameters.baremetal: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If one sets the namespace, this would first initialize the K8s environment manager and then, if `baremetal` is also set, replace it with the bare-metal one. This should be something along the lines of:
if self.actuator_parameters.namespace:
    # Init K8s env
elif self.actuator_parameters.baremetal:
    # init baremetal
else:
    # something along these lines
    self.log.warning("No namespace set in actuator and not running in baremetal mode. "
                     "This actuator will not be able to create either kubernetes or baremetal deployments")
I would also add a validator to the actuator configuration model to make sure that at most one of `namespace` and `baremetal` is set. |
||
| logger.warning( | ||
| "Baremetal mode enabled - make sure this actuator is running in the same environment where vLLM models are deployed" | ||
| ) | ||
| try: | ||
| self.env_manager = BareMetalEnvironmentManager.remote() | ||
| except Exception as error: | ||
| self.log.warning( | ||
| f"Unable to create baremetal environment manager due to {error}. " | ||
| f"Will not be able to execute experiments requiring deploying on baremetal machine" | ||
| ) | ||
| else: | ||
| # add to clean up | ||
| try: | ||
| cleaner_handle = ray.get_actor( | ||
| name=CLEANER_ACTOR, namespace=queue.ray_namespace() | ||
| ) | ||
| cleaner_handle.add_to_cleanup.remote(handle=self.env_manager) | ||
| except Exception as e: | ||
| logger.warning( | ||
| f"Failed to register custom actors for clean up {e}. Make sure you clean it up" | ||
| ) | ||
|
Comment on lines
+156
to
+166
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like the cleanup handle is set only for the baremetal env manager case? |
||
|
|
||
| logger.debug(f"Environment manager set to {self.env_manager}") | ||
|
|
||
| # initialize local port | ||
| self.local_port = 10000 | ||
|
|
||
|
|
@@ -245,6 +275,40 @@ async def submit( | |
| local_port=self.local_port, | ||
| ) | ||
| self.local_port += len(request.entities) | ||
| elif experiment.identifier in [ | ||
| "test-deployment-baremetal-v1", | ||
| ]: | ||
| if not self.env_manager: | ||
| raise MissingConfigurationForExperimentError( | ||
| f"Actuator configuration did not contain sufficient information for a baremetal environment manager to be created. " | ||
| f"Experiment {experiment} requires a baremetal environment manager to be executable." | ||
| ) | ||
| logger.info( | ||
| f"Experiment ({experiment}) - Running serve and workload experiment on baremetal environment" | ||
| ) | ||
|
|
||
| # We assume all entities have the same number of gpus required so we can just look at the first one | ||
| ray_options["num_gpus"] = experiment.propertyValuesFromEntity(request.entities[0]).get("n_gpus") | ||
| ray_options["runtime_env"] = { | ||
| "py_executable": "/scratch-shared/dniewenhuis/IBM/.venv_IBM_experiments/bin/python", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove. Why do you need to set the py_executable? |
||
| "env_vars": { | ||
| "CUDA_HOME": os.environ["CUDA_HOME"], | ||
| "LD_LIBRARY_PATH": os.environ["LD_LIBRARY_PATH"], | ||
| "PATH": os.environ["PATH"], | ||
| } | ||
| } | ||
|
|
||
| logger.debug( | ||
| f"Starting experiment with options: {ray_options}" | ||
| ) | ||
|
|
||
| run_serve_and_workload_experiment.options(**ray_options).remote( | ||
| request=request, | ||
| experiment=experiment, | ||
| state_update_queue=self._stateUpdateQueue, | ||
| actuator_parameters=self.actuator_parameters, | ||
| env_manager=self.env_manager, | ||
| ) | ||
| else: | ||
| run_workload_experiment.options(**ray_options).remote( | ||
| request=request, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -79,6 +79,13 @@ class VLLMPerformanceTestParameters(GenericActuatorParameters): | |
| description="If true, disables automatic installation of vllm and guidellm dependencies in Ray task environment. Useful for development when dependencies are already installed." | ||
| ), | ||
| ] = False | ||
| baremetal: Annotated[ | ||
| bool, | ||
| pydantic.Field( | ||
| description="If true, This assumes vllm models are deployed in a bare metal setting." | ||
| ), | ||
| ] = False | ||
|
|
||
|
Comment on lines
+82
to
+88
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a validator that checks that at most one of `baremetal` and `namespace` is set. This helps reduce the number of checks that must be done at runtime. |
||
|
|
||
| @pydantic.model_validator(mode="before") | ||
| @classmethod | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| # Copyright IBM Corporation 2025, 2026 | ||
| # SPDX-License-Identifier: MIT | ||
|
|
||
| import logging | ||
|
|
||
| from ado_actuators.vllm_performance.k8s.manage_components import ( | ||
| ComponentsManager, | ||
| ) | ||
| from ado_actuators.vllm_performance.k8s.yaml_support.build_components import ( | ||
| ComponentsYaml, | ||
| VLLMDtype, | ||
| ) | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def create_test_environment(
    model_name: str,
    base_url: str = "http://localhost:8000",
    tensor_parallel_size: int = 1,
    max_model_len: int = -1,
    max_num_batched_tokens: int = -1,
    max_num_seqs: int = 256,
    hf_token: str | None = None,
) -> bool:
    """Launch ``vllm serve`` as a local subprocess and wait until it answers.

    Used for the test-deployment-baremetal-v1 experiment, where vLLM is
    served directly on the machine instead of through a kubernetes
    deployment.

    The function:
      1. Starts ``vllm serve <model_name>`` bound to 0.0.0.0:8000, sending
         the server's stdout/stderr to a ``vllm_serve-<uuid>.log`` file in
         the current working directory.
      2. Polls ``{base_url}/v1/models`` until it returns HTTP 200, or until
         the server process exits.

    The server process is not terminated by this function.

    Args:
        model_name: Model identifier passed to ``vllm serve``.
        base_url: Base URL polled for readiness. The serve port is fixed at
            8000, so a URL pointing at another port will never become ready.
        tensor_parallel_size: Value for ``--tensor-parallel-size``.
        max_model_len: Value for ``--max-model-len``; omitted unless > 0.
        max_num_batched_tokens: Value for ``--max-num-batched-tokens``;
            omitted unless > 0.
        max_num_seqs: Value for ``--max-num-seqs``.
        hf_token: If given, exported to the server process as ``HF_TOKEN``.

    Returns:
        True if the readiness endpoint returned 200, False if the server
        process exited before that happened.
    """
    # NOTE(review): a reviewer flagged this function as not being called
    # anywhere in the code yet.

    # Imported locally because the module header only imports ``logging``;
    # without these the function fails with NameError on first use.
    import os
    import subprocess
    import time
    import uuid

    import requests

    print("Start loading the LLM")

    log_file_name = f"vllm_serve-{uuid.uuid4()}.log"
    logger.debug(f"Starting vLLM server and logging to {log_file_name}...")

    # Environment handed to the server process: the current environment plus
    # the log level and, optionally, the Hugging Face token.
    env = dict(os.environ)
    env["VLLM_BENCH_LOGLEVEL"] = logging.getLevelName(logger.getEffectiveLevel())
    if hf_token is not None:
        env["HF_TOKEN"] = hf_token

    command = [
        "vllm",
        "serve",
        model_name,
        "--tensor-parallel-size",
        str(tensor_parallel_size),
        "--max-num-seqs",
        str(max_num_seqs),
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
    ]
    # Non-positive values mean "not set": the flag is left out entirely.
    if max_model_len > 0:
        command += ["--max-model-len", str(max_model_len)]
    if max_num_batched_tokens > 0:
        command += ["--max-num-batched-tokens", str(max_num_batched_tokens)]

    success = False
    # The context manager closes the log file on every exit path,
    # including exceptions raised while starting or polling the server.
    with open(log_file_name, "w") as log_file:
        proc = subprocess.Popen(
            command,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            env=env,  # previously built but never passed, so HF_TOKEN was dropped
        )

        logger.debug("Waiting for the server to be ready...")

        while True:
            try:
                response = requests.get(f"{base_url}/v1/models", timeout=2)
            except requests.RequestException:
                logger.debug("Server is not ready yet...")
            else:
                if response.status_code == 200:
                    logger.debug("Server is ready!")
                    success = True
                    break

            # None means the process is still running; anything else is its
            # exit code, in which case the server will never become ready.
            poll = proc.poll()
            logger.debug(f"process poll: {poll}")
            if poll is not None:
                logger.error(f"Serving vLLM crashed. Check logs: {log_file_name}")
                break

            # Avoid a tight loop while connections are refused immediately.
            time.sleep(1)

    return success
|
|
||
|
|
||
if __name__ == "__main__":
    # Manual smoke test: serve one model locally with the default settings.
    # Only arguments that create_test_environment actually accepts are
    # passed; the previous kubernetes-style keywords raised TypeError.
    t_model = "meta-llama/Llama-3.1-8B-Instruct"
    create_test_environment(model_name=t_model)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this still needed?