diff --git a/gpu-validation/defaults/main.yaml b/gpu-validation/defaults/main.yaml
index 8e679fe..1b27ff7 100644
--- a/gpu-validation/defaults/main.yaml
+++ b/gpu-validation/defaults/main.yaml
@@ -1,6 +1,8 @@
 ---
 # [bool] Whether to deploy a VM before running tests
 gpu_validation_enabled: true
+# [string] GPU mode for the validation VM: "pci_passthrough" or "mig"
+gpu_validation_mode: pci_passthrough
 # [string] Name of the VM to create and destroy
 gpu_validation_vm_name: gpu-validation
 # [string] URL of the image to use when creating the VM
@@ -17,9 +19,23 @@ gpu_validation_flavor_vcpus: 4
 gpu_validation_flavor_disk: 80
 # [string] Number of GPUs for the flavor
 gpu_validation_flavor_gpus: 1
+# [string] PCI alias name for GPU device assignment
+gpu_validation_pci_alias: gpu-l4
+
 # [string] Time to wait for VM to become created, seconds
 gpu_validation_server_timeout: 180
 
+# [dict] Extra specs for the flavor (replaced by a set_fact in vm_image.yaml when mode is "mig")
+gpu_validation_extra_specs:
+  "pci_passthrough:alias": "{{ gpu_validation_pci_alias }}:{{ gpu_validation_flavor_gpus }}"
+  "hw:pci_numa_affinity_policy": "preferred"
+  "hw:hide_hypervisor_id": "true"
+  "hw:kvm_hidden": "true"
+  "hw:cpu_model": "host-passthrough"
+
+# [string] Name of the NVIDIA management library package to install
+gpu_validation_libnvidia_ml_version: libnvidia-ml
+
 # [string] Keypair to use when creating the VM
 gpu_validation_key_name: gpu-validation
 # [string] Network to use when creating the VM
diff --git a/gpu-validation/tasks/main.yaml b/gpu-validation/tasks/main.yaml
index f2982ac..2ade4d5 100644
--- a/gpu-validation/tasks/main.yaml
+++ b/gpu-validation/tasks/main.yaml
@@ -10,6 +10,12 @@
 - name: Reboot if system updates require it
   ansible.builtin.import_tasks: reboot_if_needed.yaml
 
+- name: Install NVIDIA CUDA repos
+  ansible.builtin.import_tasks: nvidia-cuda-repos.yaml
+  when:
+    - ansible_distribution == "RedHat"
+    - gpu_validation_mode == "mig"
+
 - name: Check GPUs
   ansible.builtin.import_tasks: gpus.yaml
 - name: GPUs Assertions  # noqa: ignore-errors
diff --git a/gpu-validation/tasks/nvidia-cuda-repos.yaml b/gpu-validation/tasks/nvidia-cuda-repos.yaml
new file mode 100644
index 0000000..a235f0e
--- /dev/null
+++ b/gpu-validation/tasks/nvidia-cuda-repos.yaml
@@ -0,0 +1,64 @@
+---
+# Adds the NVIDIA CUDA repository and installs the NVIDIA container tooling.
+# Imported from main.yaml only on RedHat when gpu_validation_mode is "mig".
+# NOTE(review): edpm_accel_drivers_nvidia_repo_url and edpm_accel_drivers_nvidia_repo_gpgkey
+# are not set anywhere in this patch; presumably the caller provides them - confirm.
+- name: Add NVIDIA CUDA repo
+  become: true
+  ansible.builtin.yum_repository:
+    name: nvidia-cuda
+    description: nvidia cuda repo
+    baseurl: "{{ edpm_accel_drivers_nvidia_repo_url }}/$basearch/"
+    gpgcheck: true
+    gpgkey: "{{ edpm_accel_drivers_nvidia_repo_gpgkey }}"
+
+- name: Install nvidia-container-toolkit
+  become: true
+  ansible.builtin.dnf:
+    use_backend: dnf4
+    name: nvidia-container-toolkit
+    state: present
+
+- name: Reboot the VM to find the installed drivers
+  become: true
+  ansible.builtin.reboot:
+    reboot_timeout: 600
+
+- name: Check if CDI configfile exists
+  become: true
+  ansible.builtin.stat:
+    path: /etc/cdi/nvidia.yaml
+  register: nvidia_driver_cdi_config_file
+
+# Runs only when /etc/cdi/nvidia.yaml is absent.
+- name: Configure NVIDIA container runtime
+  when: not nvidia_driver_cdi_config_file.stat.exists
+  become: true
+  block:
+    - name: Ensure CDI directory exists
+      ansible.builtin.file:
+        path: /etc/cdi
+        state: directory
+        mode: "0755"
+        owner: root
+
+    # NOTE(review): this configures containerd, while vllm-serve.service.j2 starts
+    # the workload with podman - confirm which runtime is intended.
+    - name: Configure NVIDIA container runtime
+      ansible.builtin.command: nvidia-ctk runtime configure --runtime=containerd
+      changed_when: true
+
+    - name: Generate NVIDIA CDI configuration
+      ansible.builtin.command: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+      changed_when: true
+
+- name: Install NVIDIA Management Library
+  become: true
+  ansible.builtin.dnf:
+    use_backend: dnf4
+    name: "{{ gpu_validation_libnvidia_ml_version }}"
+    state: present
+
+- name: Refresh package facts after driver installation
+  ansible.builtin.package_facts:
+    manager: rpm
diff --git a/gpu-validation/tasks/nvidia.yaml b/gpu-validation/tasks/nvidia.yaml
index 9b3bf05..2098373 100644
--- a/gpu-validation/tasks/nvidia.yaml
+++ b/gpu-validation/tasks/nvidia.yaml
@@ -21,3 +21,17 @@
   when:
     - found_nvidia | default(false)
     - not nvidia_smi_output.failed
+
+# nvidia-smi requires "csv" in --format whenever --query-gpu is used;
+# "noheader" only suppresses the column-header row.
+- name: Run nvidia smi to get MIG device status
+  ansible.builtin.command: nvidia-smi --query-gpu=uuid,pci.device_id,mig.mode.current --format=csv,noheader
+  register: nvidia_smi_mig_mode_output
+  when: gpu_validation_mode == "mig"
+  changed_when: false
+
+- name: Run lsmod to get MIG NVIDIA modules status
+  ansible.builtin.command: lsmod
+  register: lsmod_mig_mode_output
+  when: gpu_validation_mode == "mig"
+  changed_when: false
diff --git a/gpu-validation/tasks/nvidia_assertions.yaml b/gpu-validation/tasks/nvidia_assertions.yaml
index 00a88f0..d152b2e 100644
--- a/gpu-validation/tasks/nvidia_assertions.yaml
+++ b/gpu-validation/tasks/nvidia_assertions.yaml
@@ -3,5 +3,16 @@
   ansible.builtin.assert:
     that: nvidia_gpu_count.stdout|int > 0
     fail_msg: No NVIDIA GPUs found in nvidia-smi command!
-  when:
-    - found_nvidia | default(false)
+  when: found_nvidia | default(false)
+
+- name: "TEST[nvidia mig] Check MIG mode slice is Enabled on the VM"
+  ansible.builtin.assert:
+    that: "'Enabled' in (nvidia_smi_mig_mode_output.stdout | default(''))"
+    fail_msg: nvidia-smi did not return any MIG enabled device on the VM.
+  when: gpu_validation_mode == "mig"
+
+- name: "TEST[nvidia mig] Check NVIDIA modules are present with MIG VM"
+  ansible.builtin.assert:
+    that: "'nvidia' in (lsmod_mig_mode_output.stdout | default(''))"
+    fail_msg: lsmod did not return any NVIDIA modules on a MIG enabled VM.
+  when: gpu_validation_mode == "mig"
diff --git a/gpu-validation/tasks/setup.yaml b/gpu-validation/tasks/setup.yaml
index ad4364d..a868697 100644
--- a/gpu-validation/tasks/setup.yaml
+++ b/gpu-validation/tasks/setup.yaml
@@ -6,6 +6,34 @@
   when: gpu_validation_dns_server != ""
   changed_when: true
 
+- name: Install needed driver dependency packages for RHEL
+  when:
+    - ansible_distribution == "RedHat"
+    - gpu_validation_mode == "mig"
+  block:
+    - name: Execute bootstrap command - RHEL repo setup
+      ansible.builtin.include_role:
+        name: osp.edpm.edpm_bootstrap
+        tasks_from: bootstrap_command.yml
+
+    - name: Add EPEL repository for DKMS (UNSUPPORTED)
+      ansible.builtin.yum_repository:
+        name: epel
+        description: EPEL YUM repo
+        baseurl: "https://download.fedoraproject.org/pub/epel/{{ ansible_facts['distribution_major_version'] }}/Everything/$basearch/"
+        enabled: true
+        gpgcheck: true
+        gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_facts['distribution_major_version'] }}"
+
+    - name: Install DKMS (UNSUPPORTED)
+      ansible.builtin.dnf:
+        use_backend: dnf4
+        name:
+          - dkms
+          - kernel-devel
+          - kernel-headers
+        state: present
+
 - name: Install pciutils package
   ansible.builtin.dnf:
     use_backend: dnf4
diff --git a/gpu-validation/tasks/vm_image.yaml b/gpu-validation/tasks/vm_image.yaml
index 83119e4..b182bed 100644
--- a/gpu-validation/tasks/vm_image.yaml
+++ b/gpu-validation/tasks/vm_image.yaml
@@ -26,16 +26,18 @@
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
   when: not _gpu_validation_image_exists
 
+# In "mig" mode, replace the default flavor extra specs with a single VGPU resource request.
+- name: Reset extra_specs based GPU mode
+  ansible.builtin.set_fact:
+    gpu_validation_extra_specs:
+      "resources:VGPU": "1"
+  when: gpu_validation_mode == "mig"
+
 - name: Create flavor for GPU validation
   openstack.cloud.compute_flavor:
     name: "{{ gpu_validation_flavor_name }}"
     ram: "{{ gpu_validation_flavor_ram }}"
     vcpus: "{{ gpu_validation_flavor_vcpus }}"
     disk: "{{ gpu_validation_flavor_disk }}"
-    extra_specs:
-      "pci_passthrough:alias": "gpu-l4:{{ gpu_validation_flavor_gpus }}"
-      "hw:pci_numa_affinity_policy": "preferred"
-      "hw:hide_hypervisor_id": "true"
-      "hw:kvm_hidden": "true"
-      "hw:cpu_model": "host-passthrough"
+    extra_specs: "{{ gpu_validation_extra_specs }}"
     ca_cert: "{{ gpu_validation_ca_cert_path }}"
diff --git a/gpu-validation/templates/vllm-serve.service.j2 b/gpu-validation/templates/vllm-serve.service.j2
index 6def836..cb90593 100644
--- a/gpu-validation/templates/vllm-serve.service.j2
+++ b/gpu-validation/templates/vllm-serve.service.j2
@@ -19,4 +19,7 @@ ExecStart=podman run \
     -p 8000:8000 \
     {{ gpu_validation_workload_container_image }} \
     --model {{ gpu_validation_model_name | quote }} \
+{% if gpu_validation_mem_utilization | default(false) %}
+    --gpu-memory-utilization {{ gpu_validation_mem_utilization }} \
+{% endif %}
     --tensor-parallel-size {{ gpu_validation_num_gpus }}