Merge pull request #1586 from stackhpc/pci-passthrough-defaults

Alex-Welsh · web-flow · commit 68fd8367345b · 2025-04-17T14:30:53.000+01:00
Add defaults for GPU PCI passthrough
diff --git a/doc/source/operations/gpu-in-openstack.rst b/doc/source/operations/gpu-in-openstack.rst
diff --git a/etc/kayobe/ansible/pci-passthrough.yml b/etc/kayobe/ansible/pci-passthrough.yml
@@ -0,0 +1,142 @@
+---
+# Bind the GPUs listed in gpu_group_map to vfio-pci on every host that belongs
+# to one of the map's groups.
+- name: Enable GPU passthrough
+  hosts: "{{ (gpu_group_map | default({})).keys() }}"
+  vars:
+    # This playbook will execute after nodes are deployed
+    # and before overcloud host configure - we can't assume
+    # users and venvs exist.
+    ansible_user: "{{ bootstrap_user }}"
+    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
+    ansible_python_interpreter: "/usr/bin/python3"
+    # Comma-separated "vendor_id:product_id" pairs for every GPU type mapped
+    # to a group this host is a member of. default({}) is applied before
+    # dict2items so that an undefined map yields an empty list instead of
+    # failing inside dict2items.
+    vfio_pci_ids: |-
+      {% set gpu_list = [] %}
+      {% set output = [] %}
+      {% for gpu_group in gpu_group_map | default({}) | dict2items %}
+      {% if gpu_group.key in group_names %}
+      {% set _ = gpu_list.append(gpu_group.value) %}
+      {% endif %}
+      {% endfor %}
+      {% for item in gpu_list | flatten | unique %}
+      {% set _ = output.append(stackhpc_gpu_data[item]['vendor_id'] + ':' + stackhpc_gpu_data[item]['product_id']) %}
+      {% endfor %}
+      {{ output | join(',') }}
+    # Reboot timeout in seconds (20 minutes).
+    reboot_timeout_s: "{{ 20 * 60 }}"
+  tasks:
+    # Ask dracut to include the vfio drivers in the initramfs. A change to
+    # this file triggers an initramfs rebuild and a reboot.
+    - name: Template dracut config
+      ansible.builtin.blockinfile:
+        path: /etc/dracut.conf.d/gpu-vfio.conf
+        # dracut.conf(5) concatenates "+=" values across config files, so the
+        # list is padded with a space on both sides to keep entries from
+        # running into those of other files.
+        # NOTE(review): vfio_virqfd appears to have been folded into the vfio
+        # module in newer kernels - confirm the target kernels still ship it.
+        block: |
+          add_drivers+=" vfio vfio_iommu_type1 vfio_pci vfio_virqfd "
+        owner: root
+        group: root
+        # Quoted so the mode is always read as an octal string.
+        mode: "0660"
+        create: true
+      become: true
+      notify:
+        - Regenerate initramfs
+        - reboot
+
+    # List the vfio modules in modules-load.d; a change notifies the reboot
+    # handler.
+    - name: Add vfio to modules-load.d
+      ansible.builtin.blockinfile:
+        path: /etc/modules-load.d/vfio.conf
+        # NOTE(review): vfio_virqfd appears to have been folded into the vfio
+        # module in newer kernels - confirm the target kernels still ship it.
+        block: |
+          vfio
+          vfio_iommu_type1
+          vfio_pci
+          vfio_virqfd
+        owner: root
+        group: root
+        # Quoted so the mode is always read as an octal string.
+        mode: "0664"
+        create: true
+      become: true
+      notify: reboot
+
+    # Blacklist the nouveau driver via modprobe.d. A change to this file
+    # triggers an initramfs rebuild and a reboot.
+    - name: Blacklist nouveau
+      ansible.builtin.blockinfile:
+        path: /etc/modprobe.d/blacklist-nouveau.conf
+        block: |
+          blacklist nouveau
+          options nouveau modeset=0
+        owner: root
+        group: root
+        # Quoted so the mode is always read as an octal string.
+        mode: "0664"
+        create: true
+      become: true
+      # Listed in the same order as the dracut task; handlers run in the
+      # order they are defined in the play, not in notification order.
+      notify:
+        - Regenerate initramfs
+        - reboot
+
+    - name: Ignore unsupported model specific registers
+      # Occasionally, applications running in the VM may crash unexpectedly,
+      # whereas they would run normally on a physical machine. If, while
+      # running dmesg -wH, you encounter an error mentioning MSR, the reason
+      # for those crashes is that KVM injects a General protection fault (GPF)
+      # when the guest tries to access unsupported Model-specific registers
+      # (MSRs) - this often results in guest applications/OS crashing. A
+      # number of those issues can be solved by passing the ignore_msrs=1
+      # option to the KVM module, which will ignore unimplemented MSRs.
+      # source: https://wiki.archlinux.org/index.php/QEMU
+      ansible.builtin.blockinfile:
+        path: /etc/modprobe.d/kvm.conf
+        block: |
+          options kvm ignore_msrs=Y
+          # This option is not available in centos 7 as the kernel is too old,
+          # but it can help with dmesg spam in newer kernels (centos8?). Sample
+          # dmesg log message:
+          #  [  +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619
+          # options kvm report_ignored_msrs=N
+        owner: root
+        group: root
+        # Quoted so the mode is always read as an octal string.
+        mode: "0664"
+        create: true
+      become: true
+      notify: reboot
+
+    # Hand the IOMMU and vfio-pci kernel arguments to the
+    # stackhpc.linux.grubcmdline role. vfio_pci_ids is the comma-separated
+    # vendor:product list computed in the play vars.
+    # NOTE(review): only intel_iommu is set here; presumably AMD hosts need
+    # no extra argument - confirm.
+    # NOTE(review): this task notifies no handler itself; presumably the role
+    # triggers the "reboot" listener when the command line changes - confirm.
+    - name: Add vfio-pci.ids kernel args
+      ansible.builtin.include_role:
+        name: stackhpc.linux.grubcmdline
+      vars:
+        kernel_cmdline:
+          - intel_iommu=on
+          - iommu=pt
+          - "vfio-pci.ids={{ vfio_pci_ids }}"
+        # Presumably clears existing values of these arguments before the
+        # ones above are added - verify against the role's documentation.
+        kernel_cmdline_remove:
+          - iommu
+          - intel_iommu
+          - vfio-pci.ids
+
+  handlers:
+    # Rebuild the initramfs of the running kernel with dracut. Only runs on
+    # hosts whose os_family fact is RedHat; changed_when marks every run as
+    # changed.
+    - name: Regenerate initramfs (RedHat)
+      listen: Regenerate initramfs
+      # The shell module passes this script to /bin/sh by default, so the
+      # shebang line is only a comment; nothing below relies on bash
+      # features. Command substitutions are quoted to avoid word splitting.
+      ansible.builtin.shell: |-
+        #!/bin/bash
+        set -eux
+        kernel="$(uname -r)"
+        dracut -v -f "/boot/initramfs-${kernel}.img" "${kernel}"
+      become: true
+      changed_when: true
+      when: ansible_facts.os_family == 'RedHat'
+
+    # Rebuild the initramfs of the running kernel with update-initramfs. Only
+    # runs on hosts whose os_family fact is Debian; changed_when marks every
+    # run as changed.
+    - name: Regenerate initramfs (Debian)
+      listen: Regenerate initramfs
+      # The shell module passes this script to /bin/sh by default, so the
+      # shebang line is only a comment; nothing below relies on bash
+      # features. The command substitution is quoted to avoid word splitting.
+      ansible.builtin.shell: |-
+        #!/bin/bash
+        set -eux
+        update-initramfs -u -k "$(uname -r)"
+      become: true
+      changed_when: true
+      when: ansible_facts.os_family == 'Debian'
+
+    # Reboot the host when any task notified "reboot". Defined after the
+    # initramfs handlers, so a rebuilt initramfs is in place before the
+    # reboot (handlers run in definition order).
+    - name: Reboot
+      listen: reboot
+      become: true
+      ansible.builtin.reboot:
+        # Timeout comes from the play-level reboot_timeout_s variable.
+        reboot_timeout: "{{ reboot_timeout_s }}"
+        search_paths:
+          # Systems running molly-guard hang waiting for confirmation before rebooting without this.
+          - /lib/molly-guard
+          # Default list:
+          - /sbin
+          - /bin
+          - /usr/sbin
+          - /usr/bin
+          - /usr/local/sbin
diff --git a/etc/kayobe/hooks/overcloud-host-configure/pre.d/pci-passthrough.yml b/etc/kayobe/hooks/overcloud-host-configure/pre.d/pci-passthrough.yml
@@ -0,0 +1 @@
+../../../ansible/pci-passthrough.yml
diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml
@@ -485,6 +485,24 @@ kolla_build_args: {}
 # * groups: A list of kayobe ansible groups to map to this kolla-ansible group.
 # * vars: A dict mapping variable names to values for hosts in this
 #         kolla-ansible group.
+# NOTE(Alex-Welsh): If you want to extend the map rather than replace it, you
+# must include the Kayobe defaults in the mapping.
+# Standard Kayobe defaults:
+#  compute:
+#    groups:
+#      - "compute"
+#  control:
+#    groups:
+#      - "controllers"
+#  monitoring:
+#    groups:
+#      - "controllers"
+#  network:
+#    groups:
+#      - "controllers"
+#  storage:
+#    groups:
+#      - "controllers"
 #kolla_overcloud_inventory_top_level_group_map:
 
 # List of names of top level kolla-ansible groups. Any of these groups which
@@ -499,7 +517,9 @@ kolla_build_args: {}
 # List of names of additional host variables to pass through from kayobe hosts
 # to kolla-ansible hosts, if set. See also
 # kolla_overcloud_inventory_pass_through_host_vars_map.
-#kolla_overcloud_inventory_pass_through_host_vars_extra:
+# stackhpc_gpu_data and gpu_group_map are referenced inside the raw section
+# of kolla/config/nova/nova-compute.conf.
+# NOTE(review): presumably that section is rendered by kolla-ansible per
+# host, which is why both variables are passed through - confirm.
+kolla_overcloud_inventory_pass_through_host_vars_extra:
+  - stackhpc_gpu_data
+  - gpu_group_map
 
 # List of names of host variables to pass through from kayobe hosts to
 # kolla-ansible hosts, if set. See also
diff --git a/etc/kayobe/kolla/config/nova/nova-api.conf b/etc/kayobe/kolla/config/nova/nova-api.conf
@@ -0,0 +1,4 @@
+[pci]
+{# One alias line per distinct GPU type found in any gpu_group_map value; each type must be a key in stackhpc_gpu_data. #}
+{% for item in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | list %}
+alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
+{% endfor %}
diff --git a/etc/kayobe/kolla/config/nova/nova-compute.conf b/etc/kayobe/kolla/config/nova/nova-compute.conf
@@ -0,0 +1,13 @@
+[pci]
+{% raw %}
+{# Emit a device_spec and an alias for every GPU type mapped to a group this
+   host is a member of. default({}) is applied before dict2items so that an
+   undefined map yields no entries instead of failing inside dict2items. #}
+{% set gpu_list = [] %}
+{% for gpu_group in gpu_group_map | default({}) | dict2items %}
+{% if gpu_group.key in group_names %}
+{% set _ = gpu_list.append(gpu_group.value) %}
+{% endif %}
+{% endfor %}
+{% for item in gpu_list | flatten | unique %}
+device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" }
+alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
+{% endfor %}
+{% endraw %}
diff --git a/etc/kayobe/kolla/config/nova/nova-scheduler.conf b/etc/kayobe/kolla/config/nova/nova-scheduler.conf
@@ -0,0 +1,7 @@
+[filter_scheduler]
+# Default list plus PciPassthroughFilter
+# NOTE(Upgrade): defaults may change in each release. Default values can be
+# checked here:
+# https://docs.openstack.org/nova/latest/configuration/sample-config.html
+enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
+# NOTE(review): this looks like the upstream default value, set explicitly
+# here - confirm against the sample config linked above.
+available_filters = nova.scheduler.filters.all_filters
diff --git a/etc/kayobe/stackhpc-compute.yml b/etc/kayobe/stackhpc-compute.yml
@@ -0,0 +1,103 @@
+---
+# StackHPC compute node configuration
+
+# Map of inventory groups to GPU types.
+# This is used to determine which GPU types each compute node should pass
+# through to OpenStack.
+# Keys are group names; each value is a list of GPU types.
+# Groups must be added to kolla_overcloud_inventory_top_level_group_map.
+# GPU types must be keys in stackhpc_gpu_data.
+# The default is an empty map, so the pci-passthrough.yml playbook targets no
+# hosts until groups are added here.
+# Example GPU group map:
+# gpu_group_map:
+#   compute_a100:
+#     - a100_80
+#   compute_v100:
+#     - v100_32
+#   compute_multi_gpu:
+#     - a100_80
+#     - v100_32
+gpu_group_map: {}
+
+# Dict mapping GPUs to PCI data.
+# Resource names are used to identify the device in placement, and can be
+# edited to match deployment-specific naming conventions, either here or by
+# setting the matching <gpu>_resource_name variable.
+# The default list covers many common GPUs, but can be extended as needed.
+# Fields of each entry:
+#   resource_name: "name" of the Nova PCI alias.
+#   vendor_id / product_id: PCI IDs, used for vfio-pci.ids on the host and
+#     for the Nova device_spec and alias.
+#   device_type: "device_type" of the Nova PCI alias.
+stackhpc_gpu_data:
+  # Nvidia H100 SXM5 80GB
+  h100_80_sxm:
+    resource_name: "{{ h100_80_sxm_resource_name | default('h100_80_sxm')}}"
+    vendor_id: "10de"
+    product_id: "2330"
+    device_type: "type-PF"
+  # Nvidia A100 SXM4 80GB
+  a100_80_sxm:
+    resource_name: "{{ a100_80_sxm_resource_name | default('a100_80_sxm')}}"
+    vendor_id: "10de"
+    product_id: "20b2"
+    device_type: "type-PF"
+  # Nvidia A100 SXM4 40GB
+  a100_40_sxm:
+    resource_name: "{{ a100_40_sxm_resource_name | default('a100_40_sxm')}}"
+    vendor_id: "10de"
+    product_id: "20b0"
+    device_type: "type-PF"
+  # Nvidia A100 PCI 80GB
+  a100_80:
+    resource_name: "{{ a100_80_resource_name | default('a100_80')}}"
+    vendor_id: "10de"
+    product_id: "20b5"
+    device_type: "type-PF"
+  # Nvidia A100 PCI 40GB
+  a100_40:
+    resource_name: "{{ a100_40_resource_name | default('a100_40')}}"
+    vendor_id: "10de"
+    product_id: "20f1"
+    device_type: "type-PF"
+  # Nvidia V100 SXM3 32GB
+  v100_32_sxm3:
+    resource_name: "{{ v100_32_sxm3_resource_name | default('v100_32_sxm3')}}"
+    vendor_id: "10de"
+    product_id: "1db8"
+    device_type: "type-PCI"
+  # Nvidia V100 SXM2 32GB
+  v100_32_sxm2:
+    resource_name: "{{ v100_32_sxm2_resource_name | default('v100_32_sxm2')}}"
+    vendor_id: "10de"
+    product_id: "1db5"
+    device_type: "type-PCI"
+  # Nvidia V100 PCI 32GB
+  v100_32:
+    resource_name: "{{ v100_32_resource_name | default('v100_32')}}"
+    vendor_id: "10de"
+    product_id: "1db6"
+    device_type: "type-PCI"
+  # Nvidia RTX A6000
+  a6000:
+    resource_name: "{{ a6000_resource_name | default('a6000')}}"
+    vendor_id: "10de"
+    product_id: "2230"
+    device_type: "type-PCI"
+  # Nvidia A40
+  a40:
+    resource_name: "{{ a40_resource_name | default('a40')}}"
+    vendor_id: "10de"
+    product_id: "2235"
+    device_type: "type-PF"
+  # Nvidia T4
+  t4:
+    resource_name: "{{ t4_resource_name | default('t4')}}"
+    vendor_id: "10de"
+    product_id: "1eb8"
+    device_type: "type-PF"
+  # Nvidia L40
+  l40:
+    resource_name: "{{ l40_resource_name | default('l40')}}"
+    vendor_id: "10de"
+    product_id: "26b5"
+    device_type: "type-PF"
+  # Nvidia L40s
+  l40s:
+    resource_name: "{{ l40s_resource_name | default('l40s')}}"
+    vendor_id: "10de"
+    product_id: "26b9"
+    device_type: "type-PF"
diff --git a/releasenotes/notes/pci-passthrough-support-0c7e62585aaf2c23.yaml b/releasenotes/notes/pci-passthrough-support-0c7e62585aaf2c23.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Added templates and a playbook to simplify configuration of PCI passthrough
+    GPUs. GPU types can be mapped to inventory groups with the
+    ``gpu_group_map`` variable, which will configure the host and Nova
+    automatically. A list of supported GPUs can be found in
+    ``etc/kayobe/stackhpc-compute.yml`` under ``stackhpc_gpu_data``.