Skip to content

Commit 369f715

Browse files
committed
resources: GPUFS: ROCm 7.0 and MI350 support
1 parent 048dd88 commit 369f715

File tree

5 files changed

+29
-12
lines changed

5 files changed

+29
-12
lines changed

src/x86-ubuntu-gpu-ml/build.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ fi
1414
# Install the needed plugins
1515
./packer init x86-ubuntu-gpu-ml.pkr.hcl
1616

17-
# Build the image
18-
./packer build x86-ubuntu-gpu-ml.pkr.hcl
17+
# Build the image - Pass command line options from this script to build
18+
# command. This can be used to set variables such as the qemu path.
19+
./packer build "$@" x86-ubuntu-gpu-ml.pkr.hcl

src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ insmod /lib/modules/`uname -r`/updates/dkms/amdkcl.ko.zst
2424
insmod /lib/modules/`uname -r`/updates/dkms/amd-sched.ko.zst
2525
insmod /lib/modules/`uname -r`/updates/dkms/amdxcp.ko.zst
2626
insmod /lib/modules/`uname -r`/updates/dkms/amddrm_buddy.ko.zst
27+
insmod /lib/modules/`uname -r`/updates/dkms/amddrm_exec.ko.zst
2728
insmod /lib/modules/`uname -r`/updates/dkms/amdttm.ko.zst
2829
insmod /lib/modules/`uname -r`/updates/dkms/amddrm_ttm_helper.ko.zst
2930

8.02 KB
Binary file not shown.

src/x86-ubuntu-gpu-ml/scripts/rocm-install.sh

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ sudo mkdir --parents --mode=0755 /etc/apt/keyrings
6060
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
6161
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
6262

63-
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.4/ubuntu noble main" \
63+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/7.0/ubuntu noble main" \
6464
| sudo tee /etc/apt/sources.list.d/amdgpu.list
6565

66-
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 noble main" \
66+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0 noble main" \
6767
| sudo tee --append /etc/apt/sources.list.d/rocm.list
6868
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
6969
| sudo tee /etc/apt/preferences.d/rocm-pin-600
@@ -86,8 +86,8 @@ sudo chmod 777 /usr/lib/firmware/amdgpu/ip_discovery.bin
8686

8787

8888
# Install a known-working version of Linux as this might change after stable
89-
# release. Installl this after DKMS so they are rebuilt.
90-
KERNEL=6.8.0-60-generic
89+
# release. Install this after DKMS so they are rebuilt.
90+
KERNEL=6.8.0-79-generic
9191

9292
sudo apt -y install "linux-image-${KERNEL}"
9393
sudo apt -y install "linux-headers-${KERNEL}" "linux-modules-extra-${KERNEL}"
@@ -105,6 +105,13 @@ if [ ! -f ./gem5_wmi.ko ]; then
105105
fi
106106
popd
107107

108+
# Make the discovery files writeable by packer
109+
touch /usr/lib/firmware/amdgpu/mi300_discovery
110+
touch /usr/lib/firmware/amdgpu/mi350_discovery
111+
112+
chmod 777 /usr/lib/firmware/amdgpu/mi300_discovery
113+
chmod 777 /usr/lib/firmware/amdgpu/mi350_discovery
114+
108115
# Note about pip: This disk is created for the express purpose of being run in
109116
# gem5 and is therefore effectively sandboxed enough that we can use the pip
110117
# option --break-system-packages. If you plan to modify this disk image with
@@ -121,11 +128,9 @@ pip3 install --break-system-packages torch torchvision torchaudio --index-url ht
121128

122129
# For a newer version, uncomment one of the lines below and remove the above install:
123130
# Warning: Absurdly slow compared to ROCm 6.0 *in simulation*:
124-
#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4
125-
# Warning: Missing python module torch.sparse.......:
126-
#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3
131+
#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4
127132
# Warning: nightly build, may not work depending on day. Use at your own risk:
128-
#pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4/ --break-system-packages
133+
#pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0 --break-system-packages
129134

130135
# Setup gem5 auto login.
131136
mv /home/gem5/serial-getty@.service /lib/systemd/system/

src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ variable "ssh_username" {
2626
default = "gem5"
2727
}
2828

29+
variable "qemu_path" {
30+
type = string
31+
default = "/usr/bin/qemu-system-x86_64"
32+
}
33+
2934
source "qemu" "initialize" {
3035
accelerator = "kvm"
3136
boot_command = ["e<wait>",
@@ -43,7 +48,7 @@ source "qemu" "initialize" {
4348
iso_urls = ["https://releases.ubuntu.com/24.04.2/ubuntu-24.04.2-live-server-amd64.iso"]
4449
memory = "8192"
4550
output_directory = "disk-image"
46-
qemu_binary = "/usr/bin/qemu-system-x86_64"
51+
qemu_binary = "${var.qemu_path}"
4752
qemuargs = [["-cpu", "host"], ["-display", "none"]]
4853
shutdown_command = "echo '${var.ssh_password}'|sudo -S shutdown -P now"
4954
ssh_password = "${var.ssh_password}"
@@ -97,10 +102,15 @@ build {
97102
}
98103

99104
provisioner "file" {
100-
destination = "/usr/lib/firmware/amdgpu/ip_discovery.bin"
105+
destination = "/usr/lib/firmware/amdgpu/mi300_discovery"
101106
source = "files/mi300_discovery"
102107
}
103108

109+
provisioner "file" {
110+
destination = "/usr/lib/firmware/amdgpu/mi350_discovery"
111+
source = "files/mi350_discovery"
112+
}
113+
104114
provisioner "file" {
105115
source = "/home/gem5/vmlinux-gpu-ml"
106116
destination = "vmlinux-gpu-ml"

0 commit comments

Comments
 (0)