From f3def4527f66ca09d1c142c1e715aaa047e1d79d Mon Sep 17 00:00:00 2001
From: Ronelle Landy
Date: Fri, 12 Jun 2026 08:16:07 +0300
Subject: [PATCH 1/2] Add MIG validations and remove GPU-specific conditionals

This PR adds validations for MIG configurations:
- Add flavor configuration for MIG testing
- Update MIG checks and NVIDIA validations to work on all GPU types
- Add RHEL guest repository support
- Add DKMS installation and reboot handling
- Install NVIDIA repos with custom RPM
- Make CUDA repo available for installation
- Install nvidia-container-toolkit and configure CDI
- Setup CDI and NVIDIA Management Library
- Add GPU utilization option in vLLM service
- Move CUDA install tasks to separate file
- Remove A30-specific conditionals to support all GPU types with
  updated RHEL 9.6 drivers
- Fix reboot sequencing for proper driver and CDI initialization

Co-Authored-By: Claude Sonnet 4.5
Signed-off-by: Ronelle Landy
---
Review notes (text in this section is ignored by git am):
- The series was re-flowed from a whitespace-collapsed copy; YAML
  indentation and blank context lines were inferred from the hunk
  line counts and should be confirmed against the target tree.
- nvidia.yaml: the MIG query now uses --query-gpu=<fields> and
  --format=csv,noheader (nvidia-smi requires "csv" in --format).
- vllm-serve.service.j2: gpu_validation_mem_utilization is read through
  default('') because this series adds no default for it.
- setup.yaml: EPEL enabled/gpgcheck use true instead of 1, matching the
  CUDA repo task.
- defaults/main.yaml: type/meaning comments corrected for the extra
  specs mapping and the libnvidia-ml package variable.
- "index" blob hashes were not regenerated after these edits.

 gpu-validation/defaults/main.yaml           | 16 ++++++
 gpu-validation/tasks/main.yaml              |  6 ++
 gpu-validation/tasks/nvidia-cuda-repos.yaml | 57 +++++++++++++++++++
 gpu-validation/tasks/nvidia.yaml            | 12 ++++
 gpu-validation/tasks/nvidia_assertions.yaml | 15 ++++++++++++-
 gpu-validation/tasks/setup.yaml             | 28 +++++++++
 gpu-validation/tasks/vm_image.yaml          | 13 +++--
 .../templates/vllm-serve.service.j2         |  3 +
 requirements.yaml                           |  1 +
 9 files changed, 143 insertions(+), 8 deletions(-)
 create mode 100644 gpu-validation/tasks/nvidia-cuda-repos.yaml

diff --git a/gpu-validation/defaults/main.yaml b/gpu-validation/defaults/main.yaml
index 8e679fe..1b27ff7 100644
--- a/gpu-validation/defaults/main.yaml
+++ b/gpu-validation/defaults/main.yaml
@@ -1,6 +1,8 @@
 ---
 # [bool] Whether to deploy a VM before running tests
 gpu_validation_enabled: true
+# [string] Mode of running the GPU-enabled device
+gpu_validation_mode: pci_passthrough
 # [string] Name of the VM to create and destroy
 gpu_validation_vm_name: gpu-validation
 # [string] URL of the image to use when creating the VM
@@ -17,9 +19,23 @@ gpu_validation_flavor_vcpus: 4
 gpu_validation_flavor_disk: 80
 # [string] Number of GPUs for the flavor
 gpu_validation_flavor_gpus: 1
+# [string] PCI alias name for GPU device assignment
+gpu_validation_pci_alias: gpu-l4
+
 # [string] Time to wait for VM to become created, seconds
 gpu_validation_server_timeout: 180
 
+# [dict] Extra specs for the flavor
+gpu_validation_extra_specs:
+  "pci_passthrough:alias": "{{ gpu_validation_pci_alias }}:{{ gpu_validation_flavor_gpus }}"
+  "hw:pci_numa_affinity_policy": "preferred"
+  "hw:hide_hypervisor_id": "true"
+  "hw:kvm_hidden": "true"
+  "hw:cpu_model": "host-passthrough"
+
+# [string] NVIDIA management library package to install via dnf
+gpu_validation_libnvidia_ml_version: libnvidia-ml
+
 # [string] Keypair to use when creating the VM
 gpu_validation_key_name: gpu-validation
 # [string] Network to use when creating the VM
diff --git a/gpu-validation/tasks/main.yaml b/gpu-validation/tasks/main.yaml
index f2982ac..2ade4d5 100644
--- a/gpu-validation/tasks/main.yaml
+++ b/gpu-validation/tasks/main.yaml
@@ -10,6 +10,12 @@
 - name: Reboot if system updates require it
   ansible.builtin.import_tasks: reboot_if_needed.yaml
 
+- name: Install NVIDIA CUDA repos
+  ansible.builtin.import_tasks: nvidia-cuda-repos.yaml
+  when:
+    - ansible_distribution == "RedHat"
+    - gpu_validation_mode == "mig"
+
 - name: Check GPUs
   ansible.builtin.import_tasks: gpus.yaml
 - name: GPUs Assertions  # noqa: ignore-errors
diff --git a/gpu-validation/tasks/nvidia-cuda-repos.yaml b/gpu-validation/tasks/nvidia-cuda-repos.yaml
new file mode 100644
index 0000000..a235f0e
--- /dev/null
+++ b/gpu-validation/tasks/nvidia-cuda-repos.yaml
@@ -0,0 +1,57 @@
+---
+- name: Add NVIDIA CUDA repo
+  become: true
+  ansible.builtin.yum_repository:
+    name: nvidia-cuda
+    description: nvidia cuda repo
+    baseurl: "{{ edpm_accel_drivers_nvidia_repo_url }}/$basearch/"
+    gpgcheck: true
+    gpgkey: "{{ edpm_accel_drivers_nvidia_repo_gpgkey }}"
+
+- name: Install nvidia-container-toolkit
+  become: true
+  ansible.builtin.dnf:
+    use_backend: dnf4
+    name: nvidia-container-toolkit
+    state: present
+
+- name: Reboot the VM to find the installed drivers
+  become: true
+  ansible.builtin.reboot:
+    reboot_timeout: 600
+
+- name: Check if CDI configfile exists
+  become: true
+  ansible.builtin.stat:
+    path: /etc/cdi/nvidia.yaml
+  register: nvidia_driver_cdi_config_file
+
+- name: Configure NVIDIA container runtime
+  when: not nvidia_driver_cdi_config_file.stat.exists
+  become: true
+  block:
+    - name: Ensure CDI directory exists
+      ansible.builtin.file:
+        path: /etc/cdi
+        state: directory
+        mode: "0755"
+        owner: root
+
+    - name: Configure NVIDIA container runtime
+      ansible.builtin.command: nvidia-ctk runtime configure --runtime=containerd
+      changed_when: true
+
+    - name: Generate NVIDIA CDI configuration
+      ansible.builtin.command: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+      changed_when: true
+
+- name: Install NVIDIA Management Library
+  become: true
+  ansible.builtin.dnf:
+    use_backend: dnf4
+    name: "{{ gpu_validation_libnvidia_ml_version }}"
+    state: present
+
+- name: Refresh package facts after driver installation
+  ansible.builtin.package_facts:
+    manager: rpm
diff --git a/gpu-validation/tasks/nvidia.yaml b/gpu-validation/tasks/nvidia.yaml
index 9b3bf05..2098373 100644
--- a/gpu-validation/tasks/nvidia.yaml
+++ b/gpu-validation/tasks/nvidia.yaml
@@ -21,3 +21,15 @@
   when:
     - found_nvidia | default(false)
     - not nvidia_smi_output.failed
+
+- name: Run nvidia smi to get MIG device status
+  ansible.builtin.command: nvidia-smi --query-gpu=uuid,pci.device_id,mig.mode.current --format=csv,noheader
+  register: nvidia_smi_mig_mode_output
+  when: gpu_validation_mode == "mig"
+  changed_when: false
+
+- name: Run lsmod to get MIG NVIDIA modules status
+  ansible.builtin.command: lsmod
+  register: lsmod_mig_mode_output
+  when: gpu_validation_mode == "mig"
+  changed_when: false
diff --git a/gpu-validation/tasks/nvidia_assertions.yaml b/gpu-validation/tasks/nvidia_assertions.yaml
index 00a88f0..d152b2e 100644
--- a/gpu-validation/tasks/nvidia_assertions.yaml
+++ b/gpu-validation/tasks/nvidia_assertions.yaml
@@ -3,5 +3,16 @@
   ansible.builtin.assert:
     that: nvidia_gpu_count.stdout|int > 0
     fail_msg: No NVIDIA GPUs found in nvidia-smi command!
-  when:
-    - found_nvidia | default(false)
+  when: found_nvidia | default(false)
+
+- name: "TEST[nvidia mig] Check MIG mode slice is Enabled on the VM"
+  ansible.builtin.assert:
+    that: "'Enabled' in nvidia_smi_mig_mode_output.stdout"
+    fail_msg: nvidia-smi did not return any MIG enabled device on the VM.
+  when: gpu_validation_mode == "mig"
+
+- name: "TEST[nvidia mig] Check NVIDIA modules are present with MIG VM"
+  ansible.builtin.assert:
+    that: "'nvidia' in lsmod_mig_mode_output.stdout"
+    fail_msg: lsmod did not return any NVIDIA modules on a MIG enabled VM.
+  when: gpu_validation_mode == "mig"
diff --git a/gpu-validation/tasks/setup.yaml b/gpu-validation/tasks/setup.yaml
index ad4364d..d688700 100644
--- a/gpu-validation/tasks/setup.yaml
+++ b/gpu-validation/tasks/setup.yaml
@@ -6,6 +6,34 @@
   when: gpu_validation_dns_server != ""
   changed_when: true
 
+- name: Install needed driver dependency packages for RHEL
+  when:
+    - ansible_distribution == "RedHat"
+    - gpu_validation_mode == "mig"
+  block:
+    - name: Run repo-setup for RHEL VMs
+      ansible.builtin.include_role:
+        name: cifmw.general.repo_setup
+        tasks_from: rhos_release
+
+    - name: Add EPEL repository for DKMS (UNSUPPORTED)
+      ansible.builtin.yum_repository:
+        name: epel
+        description: EPEL YUM repo
+        baseurl: "https://download.fedoraproject.org/pub/epel/{{ ansible_facts['distribution_major_version'] }}/Everything/$basearch/"
+        enabled: true
+        gpgcheck: true
+        gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_facts['distribution_major_version'] }}"
+
+    - name: Install DKMS (UNSUPPORTED)
+      ansible.builtin.dnf:
+        use_backend: dnf4
+        name:
+          - dkms
+          - kernel-devel
+          - kernel-headers
+        state: present
+
 - name: Install pciutils package
   ansible.builtin.dnf:
     use_backend: dnf4
diff --git a/gpu-validation/tasks/vm_image.yaml b/gpu-validation/tasks/vm_image.yaml
index 83119e4..b182bed 100644
--- a/gpu-validation/tasks/vm_image.yaml
+++ b/gpu-validation/tasks/vm_image.yaml
@@ -26,16 +26,17 @@
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
   when: not _gpu_validation_image_exists
 
+- name: Reset extra_specs based GPU mode
+  ansible.builtin.set_fact:
+    gpu_validation_extra_specs:
+      "resources:VGPU": "1"
+  when: gpu_validation_mode == "mig"
+
 - name: Create flavor for GPU validation
   openstack.cloud.compute_flavor:
     name: "{{ gpu_validation_flavor_name }}"
     ram: "{{ gpu_validation_flavor_ram }}"
     vcpus: "{{ gpu_validation_flavor_vcpus }}"
     disk: "{{ gpu_validation_flavor_disk }}"
-    extra_specs:
-      "pci_passthrough:alias": "gpu-l4:{{ gpu_validation_flavor_gpus }}"
-      "hw:pci_numa_affinity_policy": "preferred"
-      "hw:hide_hypervisor_id": "true"
-      "hw:kvm_hidden": "true"
-      "hw:cpu_model": "host-passthrough"
+    extra_specs: "{{ gpu_validation_extra_specs }}"
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
diff --git a/gpu-validation/templates/vllm-serve.service.j2 b/gpu-validation/templates/vllm-serve.service.j2
index 6def836..cb90593 100644
--- a/gpu-validation/templates/vllm-serve.service.j2
+++ b/gpu-validation/templates/vllm-serve.service.j2
@@ -19,4 +19,7 @@ ExecStart=podman run \
   -p 8000:8000 \
   {{ gpu_validation_workload_container_image }} \
   --model {{ gpu_validation_model_name | quote }} \
+{% if gpu_validation_mem_utilization | default('') %}
+  --gpu-memory-utilization {{ gpu_validation_mem_utilization }} \
+{% endif %}
   --tensor-parallel-size {{ gpu_validation_num_gpus }}
diff --git a/requirements.yaml b/requirements.yaml
index 4c7a74b..3c9cd1e 100644
--- a/requirements.yaml
+++ b/requirements.yaml
@@ -5,3 +5,4 @@ collections:
   - name: community.general
     version: ">=10.0.0"
   - name: git+https://github.com/openstack-k8s-operators/edpm-ansible.git
+  - name: git+https://github.com/openstack-k8s-operators/ci-framework.git

From 4c525841d9b069e5474c83a649d6194fe9570b70 Mon Sep 17 00:00:00 2001
From: Ronelle Landy
Date: Fri, 12 Jun 2026 13:37:18 +0300
Subject: [PATCH 2/2] Replace repo_setup with edpm-ansible bootstrap_command

---
 gpu-validation/tasks/setup.yaml | 6 +++---
 requirements.yaml               | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/gpu-validation/tasks/setup.yaml b/gpu-validation/tasks/setup.yaml
index d688700..a868697 100644
--- a/gpu-validation/tasks/setup.yaml
+++ b/gpu-validation/tasks/setup.yaml
@@ -11,10 +11,10 @@
     - ansible_distribution == "RedHat"
     - gpu_validation_mode == "mig"
   block:
-    - name: Run repo-setup for RHEL VMs
+    - name: Execute bootstrap command - RHEL repo setup
       ansible.builtin.include_role:
-        name: cifmw.general.repo_setup
-        tasks_from: rhos_release
+        name: osp.edpm.edpm_bootstrap
+        tasks_from: bootstrap_command.yml
 
     - name: Add EPEL repository for DKMS (UNSUPPORTED)
       ansible.builtin.yum_repository:
diff --git a/requirements.yaml b/requirements.yaml
index 3c9cd1e..4c7a74b 100644
--- a/requirements.yaml
+++ b/requirements.yaml
@@ -5,4 +5,3 @@ collections:
   - name: community.general
     version: ">=10.0.0"
   - name: git+https://github.com/openstack-k8s-operators/edpm-ansible.git
-  - name: git+https://github.com/openstack-k8s-operators/ci-framework.git