|
1 | 1 |
|
2 | 2 | # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation |
3 | 3 |
|
4 | | -- name: Check for OFED |
| 4 | +- name: Check for OFED/DOCA |
5 | 5 | command: |
6 | 6 | cmd: dnf list --installed rdma-core |
7 | 7 | register: _dnf_rdma_core |
|
10 | 10 | - name: Assert OFED installed |
11 | 11 | assert: |
12 | 12 | that: "'mlnx' in _dnf_rdma_core.stdout" |
13 | | - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" |
| 13 | + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" |
14 | 14 |
|
15 | 15 | - name: Install cuda repo |
16 | 16 | get_url: |
17 | | - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" |
18 | | - url: "{{ cuda_repo }}" |
| 17 | + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" |
| 18 | + url: "{{ cuda_repo_url }}" |
19 | 19 |
|
20 | 20 | - name: Check if nvidia driver module is enabled |
21 | | - shell: |
22 | | - cmd: dnf module list --enabled nvidia-driver |
| 21 | + ansible.builtin.command: dnf module list --enabled nvidia-driver |
23 | 22 | changed_when: false |
24 | 23 | failed_when: false |
25 | 24 | register: _cuda_driver_module_enabled |
26 | 25 |
|
27 | 26 | - name: Enable nvidia driver module |
28 | | - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" |
| 27 | + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" |
29 | 28 | register: _cuda_driver_module_enable |
30 | 29 | when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" |
31 | 30 | changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" |
32 | 31 |
|
| 32 | +- name: Check if nvidia driver module is installed |
| 33 | + ansible.builtin.command: dnf module list --installed nvidia-driver |
| 34 | + changed_when: false |
| 35 | + failed_when: false |
| 36 | + register: _cuda_driver_module_installed |
| 37 | + |
33 | 38 | - name: Install nvidia drivers |
34 | 39 | ansible.builtin.command: dnf module install -y nvidia-driver |
35 | 40 | register: _cuda_driver_install |
36 | | - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" |
| 41 | + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" |
37 | 42 | changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" |
38 | 43 |
|
| 44 | +- name: Check kernel has not been modified |
| 45 | + assert: |
| 46 | + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched |
| 47 | + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" |
| 48 | + |
39 | 49 | - name: Install cuda packages |
40 | 50 | ansible.builtin.dnf: |
41 | 51 | name: "{{ cuda_packages }}" |
| 52 | + when: cuda_package_version != 'none' |
42 | 53 | register: cuda_package_install |
43 | 54 |
|
44 | 55 | - name: Add cuda binaries to path |
45 | 56 | lineinfile: |
46 | 57 | path: /etc/profile.d/sh.local |
47 | 58 | line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' |
| 59 | + when: cuda_package_version != 'none' |
48 | 60 |
|
49 | 61 | - name: Enable NVIDIA Persistence Daemon |
50 | 62 | systemd: |
|
60 | 72 | - name: Wait for hosts to be reachable |
61 | 73 | wait_for_connection: |
62 | 74 | sleep: 15 |
| 75 | + when: cuda_package_install.changed |
0 commit comments