IBM · DanteNiewenhuis · Apr 2, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 7, 2026
diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator.py
@@ -12,11 +12,14 @@
     VLLMPerformanceTestParameters,
 )
 from ado_actuators.vllm_performance.env_manager import (
+    BareMetalEnvironmentManager,
     EnvironmentManager,
+    K8SEnvironmentManager,
 )
 from ado_actuators.vllm_performance.experiment_executor import (
     run_resource_and_workload_experiment,
     run_workload_experiment,
+    run_serve_and_workload_experiment,
 )
 
 from orchestrator.core.actuatorconfiguration.config import GenericActuatorParameters
@@ -104,11 +107,13 @@ def __init__(
         # Set parameters
         self.actuator_parameters = params
         # çreate environment manager actor
-        self.env_manager = None
+        self.env_manager: EnvironmentManager | None = None
+
+        logger.debug(f"Actuator initialized with parameters: {params}")
 
         if self.actuator_parameters.namespace:
             try:
-                self.env_manager = EnvironmentManager.remote(
+                self.env_manager = K8SEnvironmentManager.remote(
                     namespace=params.namespace,
                     max_concurrent=params.max_environments,
                     in_cluster=params.in_cluster,
@@ -134,9 +139,41 @@ def __init__(
                     )
         else:
             self.log.warning(
-                "No namespace set in acutator configuration - will not be able to create deployments"
+                "No namespace set in acutator configuration - will not be able to create deployments using k8s"
             )
+
+        if self.actuator_parameters.baremetal:
+            logger.warning(
+                "Baremetal mode enabled - make sure this actuator is running in the same environment where vLLM models are deployed"
+            )
+            # A k8s manager may already occupy the single env_manager slot.
+            k8s_manager = self.env_manager
+            try:
+                self.env_manager = BareMetalEnvironmentManager.remote()
+            except Exception as error:
+                self.log.warning(
+                    f"Unable to create baremetal environment manager due to {error}. "
+                    "Will not be able to execute experiments requiring deploying on baremetal machine"
+                )
+            else:
+                if k8s_manager is not None:
+                    self.log.warning(
+                        "Both a namespace and baremetal mode are configured - "
+                        "the baremetal environment manager replaces the k8s environment manager"
+                    )
+                # add to clean up
+                try:
+                    cleaner_handle = ray.get_actor(
+                        name=CLEANER_ACTOR, namespace=queue.ray_namespace()
+                    )
+                    cleaner_handle.add_to_cleanup.remote(handle=self.env_manager)
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to register custom actors for clean up {e}. Make sure you clean it up"
+                    )
 
+        logger.debug(f"Environment manager set to {self.env_manager}")
+
         # initialize local port
         self.local_port = 10000
 
@@ -245,6 +275,51 @@ async def submit(
                 local_port=self.local_port,
             )
             self.local_port += len(request.entities)
+        elif experiment.identifier in [
+            "test-deployment-baremetal-v1",
+        ]:
+            # self.env_manager may also hold a k8s manager, so the baremetal flag is
+            # checked too before handing the manager to a baremetal run.
+            if not self.actuator_parameters.baremetal or not self.env_manager:
+                raise MissingConfigurationForExperimentError(
+                    f"Actuator configuration did not contain sufficient information for a baremetal environment manager to be created. "
+                    f"Experiment {experiment} requires a baremetal environment manager to be executable."
+                )
+            logger.info(
+                f"Experiment ({experiment}) - Running serve and workload experiment on baremetal environment"
+            )
+
+            # We assume all entities have the same number of gpus required so we can just look at the first one
+            entity_values = experiment.propertyValuesFromEntity(request.entities[0])
+            ray_options["num_gpus"] = entity_values.get("n_gpus")
+
+            # Forward only the variables that are set, so a host without e.g.
+            # CUDA_HOME does not make submit() fail with KeyError.
+            forwarded_env = {
+                name: os.environ[name]
+                for name in ("CUDA_HOME", "LD_LIBRARY_PATH", "PATH")
+                if name in os.environ
+            }
+
+            # Local import: only this branch needs the interpreter path.
+            import sys
+
+            ray_options["runtime_env"] = {
+                # Use the interpreter running this actuator instead of a path that
+                # exists on a single machine only.
+                "py_executable": sys.executable,
+                "env_vars": forwarded_env,
+            }
+
+            logger.debug(f"Starting experiment with options: {ray_options}")
+
+            run_serve_and_workload_experiment.options(**ray_options).remote(
+                request=request,
+                experiment=experiment,
+                state_update_queue=self._stateUpdateQueue,
+                actuator_parameters=self.actuator_parameters,
+                env_manager=self.env_manager,
+            )
         else:
             run_workload_experiment.options(**ray_options).remote(
                 request=request,

diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator_parameters.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/actuator_parameters.py
@@ -79,6 +79,13 @@ class VLLMPerformanceTestParameters(GenericActuatorParameters):
             description="If true, disables automatic installation of vllm and guidellm dependencies in Ray task environment. Useful for development when dependencies are already installed."
         ),
     ] = False
+    # Opt-in switch for bare-metal mode; the actuator reads it to create a BareMetalEnvironmentManager.
+    baremetal: Annotated[
+        bool,
+        pydantic.Field(
+            description="If true, This assumes vllm models are deployed in a bare metal setting."
+        ),
+    ] = False
 
 
     @pydantic.model_validator(mode="before")
     @classmethod

diff --git a/...actuators/vllm_performance/ado_actuators/vllm_performance/baremetal/create_environment.py b/...actuators/vllm_performance/ado_actuators/vllm_performance/baremetal/create_environment.py
@@ -0,0 +1,144 @@
+# Copyright IBM Corporation 2025, 2026
+# SPDX-License-Identifier: MIT
+
+import logging
+import os
+import subprocess
+import time
+import uuid
+from urllib.parse import urlsplit
+
+from ado_actuators.vllm_performance.k8s.manage_components import (
+    ComponentsManager,
+)
+from ado_actuators.vllm_performance.k8s.yaml_support.build_components import (
+    ComponentsYaml,
+    VLLMDtype,
+)
+
+logger = logging.getLogger(__name__)
+
+# Port passed to `vllm serve` when base_url does not name one explicitly.
+DEFAULT_VLLM_PORT = 8000
+# Pause between two readiness probes so that waiting does not busy-loop.
+READINESS_POLL_INTERVAL_SECONDS = 2.0
+
+
+def _build_serve_command(
+    model_name: str,
+    port: int,
+    tensor_parallel_size: int,
+    max_model_len: int,
+    max_num_batched_tokens: int,
+    max_num_seqs: int,
+) -> list[str]:
+    """Return the ``vllm serve`` argument list for the given configuration."""
+    command = [
+        "vllm",
+        "serve",
+        model_name,
+        "--tensor-parallel-size",
+        str(tensor_parallel_size),
+        "--max-num-seqs",
+        str(max_num_seqs),
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(port),
+    ]
+    # Non-positive values mean "not set", so the flag is left out.
+    if max_model_len > 0:
+        command += ["--max-model-len", str(max_model_len)]
+    if max_num_batched_tokens > 0:
+        command += ["--max-num-batched-tokens", str(max_num_batched_tokens)]
+    return command
+
+
+def create_test_environment(
+    model_name: str,
+    base_url: str = "http://localhost:8000",
+    tensor_parallel_size: int = 1,
+    max_model_len: int = -1,
+    max_num_batched_tokens: int = -1,
+    max_num_seqs: int = 256,
+    hf_token: str | None = None,
+) -> bool:
+    """
+    Start a ``vllm serve`` process and wait until it answers on ``base_url``.
+
+    This is used for the test-deployment-baremetal-v1 experiment where we want to test the performance of vLLM
+    on a baremetal machine without doing any kubernetes deployment. This is useful to isolate the performance of vLLM
+    from the performance of the kubernetes deployment.
+
+    A new server process is always started; an already running server is not detected or reused.
+    On success the process is left running and its output keeps going to a ``vllm_serve-<uuid>.log``
+    file in the current working directory.
+
+    :param model_name: model passed to ``vllm serve``
+    :param base_url: URL probed for readiness; its port (8000 if absent) is the port the server listens on
+    :param tensor_parallel_size: value of ``--tensor-parallel-size``
+    :param max_model_len: value of ``--max-model-len``; omitted when not positive
+    :param max_num_batched_tokens: value of ``--max-num-batched-tokens``; omitted when not positive
+    :param max_num_seqs: value of ``--max-num-seqs``
+    :param hf_token: if given, exported to the server process as ``HF_TOKEN``
+    :return: True once ``GET {base_url}/v1/models`` returns HTTP 200, False if the server process exits first
+    """
+
+    import requests
+
+    port = urlsplit(base_url).port or DEFAULT_VLLM_PORT
+    command = _build_serve_command(
+        model_name=model_name,
+        port=port,
+        tensor_parallel_size=tensor_parallel_size,
+        max_model_len=max_model_len,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_num_seqs=max_num_seqs,
+    )
+
+    env = dict(os.environ)
+    env["VLLM_BENCH_LOGLEVEL"] = logging.getLevelName(logger.getEffectiveLevel())
+    if hf_token is not None:
+        env["HF_TOKEN"] = hf_token
+
+    log_file_name = f"vllm_serve-{uuid.uuid4()}.log"
+    logger.info("Start loading the LLM")
+    logger.debug(f"Starting vLLM server and logging to {log_file_name}...")
+
+    # The child process inherits its own copy of the log descriptor, so closing ours
+    # when the block exits does not interrupt the server's logging.
+    with open(log_file_name, "w") as log_file:
+        proc = subprocess.Popen(
+            command,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
+            env=env,
+        )
+
+        logger.debug("Waiting for the server to be ready...")
+        while True:
+            # Check the process first: a dead server must end the wait instead of
+            # being polled forever.
+            exit_code = proc.poll()
+            logger.debug(f"process poll: {exit_code}")
+            if exit_code is not None:
+                logger.error(f"Serving vLLM crashed. Check logs: {log_file_name}")
+                return False
+
+            try:
+                response = requests.get(f"{base_url}/v1/models", timeout=2)
+            except requests.RequestException:
+                logger.debug("Server is not ready yet...")
+            else:
+                if response.status_code == 200:
+                    logger.debug("Server is ready!")
+                    return True
+                logger.debug(f"Server answered with status {response.status_code}")
+
+            time.sleep(READINESS_POLL_INTERVAL_SECONDS)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    t_model = "meta-llama/Llama-3.1-8B-Instruct"
+    create_test_environment(model_name=t_model)