Skip to content

Commit 68fd836

Browse files
authored
Merge pull request #1586 from stackhpc/pci-passthrough-defaults
Add defaults for GPU PCI passthrough
2 parents b8db321 + 9a9b4e8 commit 68fd836

File tree

9 files changed

+433
-265
lines changed

9 files changed

+433
-265
lines changed

doc/source/operations/gpu-in-openstack.rst

Lines changed: 134 additions & 264 deletions
Large diffs are not rendered by default.
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
---
# Prepare GPU hypervisors for PCI passthrough.
#
# The play targets every inventory group that appears as a key in
# gpu_group_map. On each host it:
#   * adds the vfio modules to the initramfs (dracut) and to modules-load.d,
#   * blacklists the nouveau driver,
#   * sets the kvm ignore_msrs module option,
#   * adds IOMMU and vfio-pci.ids arguments to the kernel command line.
# Tasks that change a file notify handlers which regenerate the initramfs
# and/or reboot the host.
- name: Enable GPU passthrough
  hosts: "{{ (gpu_group_map | default({})).keys() }}"
  vars:
    # This playbook will execute after nodes are deployed
    # and before overcloud host configure - we can't assume
    # users and venvs exist.
    ansible_user: "{{ bootstrap_user }}"
    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
    ansible_python_interpreter: "/usr/bin/python3"
    # Comma-separated "vendor:product" PCI ids for every GPU type mapped to a
    # group this host belongs to. default({}) is applied before dict2items so
    # that an undefined gpu_group_map yields an empty list rather than a
    # filter error, and "~" is used so ids given as unquoted numbers are
    # still concatenated as strings.
    vfio_pci_ids: |-
      {% set gpu_list = [] %}
      {% set output = [] %}
      {% for gpu_group in (gpu_group_map | default({})) | dict2items %}
      {% if gpu_group.key in group_names %}
      {% set _ = gpu_list.append(gpu_group.value) %}
      {% endif %}
      {% endfor %}
      {% for item in gpu_list | flatten | unique %}
      {% set _ = output.append(stackhpc_gpu_data[item]['vendor_id'] ~ ':' ~ stackhpc_gpu_data[item]['product_id']) %}
      {% endfor %}
      {{ output | join(',') }}
    # Time allowed for a host to come back after a reboot (20 minutes).
    reboot_timeout_s: "{{ 20 * 60 }}"
  tasks:
    - name: Template dracut config
      ansible.builtin.blockinfile:
        path: /etc/dracut.conf.d/gpu-vfio.conf
        # dracut appends this value to any existing add_drivers string, so the
        # list needs a leading and trailing space to stay separated from
        # entries contributed by other configuration files.
        # NOTE(review): hosts configured with the previous unpadded value will
        # see this file change once, triggering the handlers below.
        block: |
          add_drivers+=" vfio vfio_iommu_type1 vfio_pci vfio_virqfd "
        owner: root
        group: root
        mode: "0660"
        create: true
      become: true
      notify:
        - Regenerate initramfs
        - reboot

    - name: Add vfio to modules-load.d
      ansible.builtin.blockinfile:
        path: /etc/modules-load.d/vfio.conf
        block: |
          vfio
          vfio_iommu_type1
          vfio_pci
          vfio_virqfd
        owner: root
        group: root
        mode: "0664"
        create: true
      become: true
      notify: reboot

    - name: Blacklist nouveau
      ansible.builtin.blockinfile:
        path: /etc/modprobe.d/blacklist-nouveau.conf
        block: |
          blacklist nouveau
          options nouveau modeset=0
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify:
        - Regenerate initramfs
        - reboot

    - name: Ignore unsupported model specific registers
      # Occasionally, applications running in the VM may crash unexpectedly,
      # whereas they would run normally on a physical machine. If, while
      # running dmesg -wH, you encounter an error mentioning MSR, the reason
      # for those crashes is that KVM injects a General protection fault (GPF)
      # when the guest tries to access unsupported Model-specific registers
      # (MSRs) - this often results in guest applications/OS crashing. A
      # number of those issues can be solved by passing the ignore_msrs=1
      # option to the KVM module, which will ignore unimplemented MSRs.
      # source: https://wiki.archlinux.org/index.php/QEMU
      ansible.builtin.blockinfile:
        path: /etc/modprobe.d/kvm.conf
        block: |
          options kvm ignore_msrs=Y
          # This option is not available in centos 7 as the kernel is too old,
          # but it can help with dmesg spam in newer kernels (centos8?). Sample
          # dmesg log message:
          # [ +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619
          # options kvm report_ignored_msrs=N
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify: reboot

    - name: Add vfio-pci.ids kernel args
      ansible.builtin.include_role:
        name: stackhpc.linux.grubcmdline
      vars:
        # Arguments to add to the kernel command line.
        kernel_cmdline:
          - intel_iommu=on
          - iommu=pt
          - "vfio-pci.ids={{ vfio_pci_ids }}"
        # Existing arguments with these names are removed first so that
        # stale values are not left behind.
        kernel_cmdline_remove:
          - iommu
          - intel_iommu
          - vfio-pci.ids

  handlers:
    # Both initramfs handlers listen on the same topic; the os_family
    # condition selects the one that matches the host.
    - name: Regenerate initramfs (RedHat)
      listen: Regenerate initramfs
      ansible.builtin.shell: |-
        #!/bin/bash
        set -eux
        dracut -v -f /boot/initramfs-$(uname -r).img $(uname -r)
      become: true
      changed_when: true
      when: ansible_facts.os_family == 'RedHat'

    - name: Regenerate initramfs (Debian)
      listen: Regenerate initramfs
      ansible.builtin.shell: |-
        #!/bin/bash
        set -eux
        update-initramfs -u -k $(uname -r)
      become: true
      changed_when: true
      when: ansible_facts.os_family == 'Debian'

    # Defined after the initramfs handlers so that, when both are notified,
    # the initramfs is rebuilt before the host reboots.
    - name: Reboot
      listen: reboot
      become: true
      ansible.builtin.reboot:
        reboot_timeout: "{{ reboot_timeout_s }}"
        search_paths:
          # Systems running molly-guard hang waiting for confirmation before rebooting without this.
          - /lib/molly-guard
          # Default list:
          - /sbin
          - /bin
          - /usr/sbin
          - /usr/bin
          - /usr/local/sbin
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../ansible/pci-passthrough.yml

etc/kayobe/kolla.yml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,24 @@ kolla_build_args: {}
485485
# * groups: A list of kayobe ansible groups to map to this kolla-ansible group.
486486
# * vars: A dict mapping variable names to values for hosts in this
487487
# kolla-ansible group.
488+
# NOTE(Alex-Welsh): If you want to extend the map rather than replace it, you
489+
# must include the Kayobe defaults in the mapping.
490+
# Standard Kayobe defaults:
491+
# compute:
492+
# groups:
493+
# - "compute"
494+
# control:
495+
# groups:
496+
# - "controllers"
497+
# monitoring:
498+
# groups:
499+
# - "controllers"
500+
# network:
501+
# groups:
502+
# - "controllers"
503+
# storage:
504+
# groups:
505+
# - "controllers"
488506
#kolla_overcloud_inventory_top_level_group_map:
489507

490508
# List of names of top level kolla-ansible groups. Any of these groups which
@@ -499,7 +517,9 @@ kolla_build_args: {}
499517
# List of names of additional host variables to pass through from kayobe hosts
500518
# to kolla-ansible hosts, if set. See also
501519
# kolla_overcloud_inventory_pass_through_host_vars_map.
502-
#kolla_overcloud_inventory_pass_through_host_vars_extra:
520+
kolla_overcloud_inventory_pass_through_host_vars_extra:
521+
- stackhpc_gpu_data
522+
- gpu_group_map
503523

504524
# List of names of host variables to pass through from kayobe hosts to
505525
# kolla-ansible hosts, if set. See also
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{# Emit one [pci] alias per unique GPU type listed in any gpu_group_map group. #}
{# default({}) keeps the template rendering when gpu_group_map is not defined. #}
[pci]
{% for item in (gpu_group_map | default({})) | dict2items | map(attribute='value') | flatten | unique | list %}
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{# Per-host [pci] settings: a device_spec and alias for each GPU type mapped #}
{# to a group the host is a member of. The raw block leaves the inner Jinja #}
{# unrendered by the first templating pass (presumably so it is evaluated #}
{# later with per-host group_names - confirm against the deployment flow). #}
[pci]
{% raw %}
{% set gpu_list = [] %}
{% for gpu_group in (gpu_group_map | default({})) | dict2items %}
{% if gpu_group.key in group_names %}
{% set _ = gpu_list.append(gpu_group.value) %}
{% endif %}
{% endfor %}
{% for item in gpu_list | flatten | unique %}
device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" }
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
{% endraw %}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[filter_scheduler]
2+
# Default list plus PciPassthroughFilter
3+
# NOTE(Upgrade): defaults may change in each release. Default values can be
4+
# checked here:
5+
# https://docs.openstack.org/nova/latest/configuration/sample-config.html
6+
enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
7+
available_filters = nova.scheduler.filters.all_filters

etc/kayobe/stackhpc-compute.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
---
# StackHPC compute node configuration

# Map of inventory groups to GPU types.
# This determines which GPU types each compute node passes through to
# OpenStack.
# Keys are group names; each value is a list of GPU types.
# Groups must be added to kolla_overcloud_inventory_top_level_group_map.
# GPU types must be keys in stackhpc_gpu_data.
# Example GPU group map:
# gpu_group_map:
#   compute_a100:
#     - a100_80
#   compute_v100:
#     - v100_32
#   compute_multi_gpu:
#     - a100_80
#     - v100_32
gpu_group_map: {}

# Dict mapping GPU types to PCI data.
# Each resource name identifies the device in placement and can be overridden
# through the matching <gpu_type>_resource_name variable to follow
# deployment-specific naming conventions.
# The default list covers many common GPUs, but can be extended as needed.
stackhpc_gpu_data:
  # Nvidia H100 SXM5 80GB
  h100_80_sxm:
    resource_name: "{{ h100_80_sxm_resource_name | default('h100_80_sxm') }}"
    vendor_id: "10de"
    product_id: "2330"
    device_type: "type-PF"
  # Nvidia A100 SXM4 80GB
  a100_80_sxm:
    resource_name: "{{ a100_80_sxm_resource_name | default('a100_80_sxm') }}"
    vendor_id: "10de"
    product_id: "20b2"
    device_type: "type-PF"
  # Nvidia A100 SXM4 40GB
  a100_40_sxm:
    resource_name: "{{ a100_40_sxm_resource_name | default('a100_40_sxm') }}"
    vendor_id: "10de"
    product_id: "20b0"
    device_type: "type-PF"
  # Nvidia A100 PCIe 80GB
  a100_80:
    resource_name: "{{ a100_80_resource_name | default('a100_80') }}"
    vendor_id: "10de"
    product_id: "20b5"
    device_type: "type-PF"
  # Nvidia A100 PCIe 40GB
  a100_40:
    resource_name: "{{ a100_40_resource_name | default('a100_40') }}"
    vendor_id: "10de"
    product_id: "20f1"
    device_type: "type-PF"
  # Nvidia V100 SXM3 32GB
  v100_32_sxm3:
    resource_name: "{{ v100_32_sxm3_resource_name | default('v100_32_sxm3') }}"
    vendor_id: "10de"
    product_id: "1db8"
    device_type: "type-PCI"
  # Nvidia V100 SXM2 32GB
  v100_32_sxm2:
    resource_name: "{{ v100_32_sxm2_resource_name | default('v100_32_sxm2') }}"
    vendor_id: "10de"
    product_id: "1db5"
    device_type: "type-PCI"
  # Nvidia V100 PCIe 32GB
  v100_32:
    resource_name: "{{ v100_32_resource_name | default('v100_32') }}"
    vendor_id: "10de"
    product_id: "1db6"
    device_type: "type-PCI"
  # Nvidia RTX A6000
  a6000:
    resource_name: "{{ a6000_resource_name | default('a6000') }}"
    vendor_id: "10de"
    product_id: "2230"
    device_type: "type-PCI"
  # Nvidia A40
  a40:
    resource_name: "{{ a40_resource_name | default('a40') }}"
    vendor_id: "10de"
    product_id: "2235"
    device_type: "type-PF"
  # Nvidia T4
  t4:
    resource_name: "{{ t4_resource_name | default('t4') }}"
    vendor_id: "10de"
    product_id: "1eb8"
    device_type: "type-PF"
  # Nvidia L40
  l40:
    resource_name: "{{ l40_resource_name | default('l40') }}"
    vendor_id: "10de"
    product_id: "26b5"
    device_type: "type-PF"
  # Nvidia L40s
  l40s:
    resource_name: "{{ l40s_resource_name | default('l40s') }}"
    vendor_id: "10de"
    product_id: "26b9"
    device_type: "type-PF"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
features:
3+
- |
4+
Added templates and a playbook to simplify configuration of PCI passthrough
5+
GPUs. GPU types can be mapped to inventory groups with the
6+
``gpu_group_map`` variable, which will configure the host and Nova
7+
automatically. A list of supported GPUs can be found in
8+
``etc/kayobe/stackhpc-compute.yml`` under ``stackhpc_gpu_data``.

0 commit comments

Comments
 (0)