From 369f71558c7c33beb168f6567171077c56f766cc Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 16 Sep 2025 22:02:18 +0000 Subject: [PATCH 1/2] resources: GPUFS: ROCm 7.0 and MI350 support --- src/x86-ubuntu-gpu-ml/build.sh | 5 +++-- src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh | 1 + src/x86-ubuntu-gpu-ml/files/mi350_discovery | Bin 0 -> 8208 bytes src/x86-ubuntu-gpu-ml/scripts/rocm-install.sh | 21 +++++++++++------- .../x86-ubuntu-gpu-ml.pkr.hcl | 14 ++++++++++-- 5 files changed, 29 insertions(+), 12 deletions(-) create mode 100644 src/x86-ubuntu-gpu-ml/files/mi350_discovery diff --git a/src/x86-ubuntu-gpu-ml/build.sh b/src/x86-ubuntu-gpu-ml/build.sh index 8c9907bad..4a47aa324 100755 --- a/src/x86-ubuntu-gpu-ml/build.sh +++ b/src/x86-ubuntu-gpu-ml/build.sh @@ -14,5 +14,6 @@ fi # Install the needed plugins ./packer init x86-ubuntu-gpu-ml.pkr.hcl -# Build the image -./packer build x86-ubuntu-gpu-ml.pkr.hcl \ No newline at end of file +# Build the image - Pass command line options from this script to build +# command. This can be used to set variable such as qemu path. +./packer build "$@" x86-ubuntu-gpu-ml.pkr.hcl \ No newline at end of file diff --git a/src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh b/src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh index a77422768..f35118831 100644 --- a/src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh +++ b/src/x86-ubuntu-gpu-ml/files/load_amdgpu.sh @@ -24,6 +24,7 @@ insmod /lib/modules/`uname -r`/updates/dkms/amdkcl.ko.zst insmod /lib/modules/`uname -r`/updates/dkms/amd-sched.ko.zst insmod /lib/modules/`uname -r`/updates/dkms/amdxcp.ko.zst insmod /lib/modules/`uname -r`/updates/dkms/amddrm_buddy.ko.zst +insmod /lib/modules/`uname -r`/updates/dkms/amddrm_exec.ko.zst insmod /lib/modules/`uname -r`/updates/dkms/amdttm.ko.zst insmod /lib/modules/`uname -r`/updates/dkms/amddrm_ttm_helper.ko.zst diff --git a/src/x86-ubuntu-gpu-ml/files/mi350_discovery b/src/x86-ubuntu-gpu-ml/files/mi350_discovery new file mode 100644 index 0000000000000000000000000000000000000000..49a8ae271c753ac892e650d6926ce8a9a496baec GIT binary patch literal 8208 zcmds6KTI6y6@Rm{J1p!1d-fhj2*z_Zn_$9P*rISjF~D9RCydaYbs`eQ79?6(M51uv zV$WI;=R#pyC_+}sDO{$w!Ue?@D+ebo&MqPsE?l^96Df+WOo8OQ@BMzSNE68?MJHt> z_BY?}y?NjJ-h1Dl83yVf&qNZ*S9faZqI~(A?m3a;b7$jS5t*L45*;Z1X=*KMD^}0{ zGMZOhJpVL$srZY{*5|Qw&wcv$fBq`c_C5Xk{#URNMHfZls2oXJrKRz}>vb z<<=A#j{_a#%R%#^6euqmE#oENy=))ynDT`30hd1sa%Bs{G9$jb=$f?*t8pIYWs$X47mOV_<~^~ zW!Jw?eZxcsUH`#<>JPd8bLtx=QgQtk)HiHud_v^k7^ydp5uc{UC-7~iMHZLruBqzb zrssUy`Plf}`2**RuGBL&-u3V9~hrIf8hKX*mr(@m3Gb=FEQ;eIA1qDb^e<3AHq(=dGi?c+>|ln z+XeEAOHbhEN5=P@Z#zG3eD3^#^FM-p=jT^xXTo@iX@9}_9~+-Kf6e(x=udNgRrYOu zHRT-m8Sv?-^HbF4ysKc|HRU|@!Kb6nr*7wZ7^m7XY|6*9)3dfzoWJ1qZGP&sGHl94 zw{LB=Pk!3%+_x!K**9!TgZe#dYro`w;`SdvzhKyupSpc(t9|k_ZvQ#$8#d*V+qbsb zCqL`qFn?<00nbv|xScd;K4x{-<^Q3^S&DQyR1ueF=TX4|Lp1mj?s@k!3Fn(n4JjIjkd+AU`EPAIyp6Ma>$PG(UD;nd@18RNt_q`2jDM7u7ec zXnqF6P}2|`y>j~%DPuep%@5>^UrsDzToug^^b51N{oE0fj&iv-Y zGR9kx66MT?7g8DHu1G>T^PdyT7=J|uC}-YsVi|F`r1eoO7Ia_WxF4ZDE@^#$=Xf5W zJ}zl}fTy32P#>4HKESj8N2re-wQmtS#Cb^%sy2`vwNHK-5#9YH;EviSzfb;m;A7{X zkbfGmt0z;ma}3}+@2ARuba{&!2uc&=*ws->U9UcvQx)T19&^F#H%f?n); zed^J#s`;gQe+m3}ct0@D@GEV^SL7cS!13^YU~<+)8*x^73(xEOfyp_XHsY=FyO8Vq zfytTIHsY@GlfW;8{Dzx+qf4(e!+8`>o(4-YTvql z=JqGe{#CVa{I|}h1RqlaeewZ z`82qFRo55llTVGG6e)#q^}jT!{v6_F&;34Sp>Lc2lM+!M=E8v?*q1%``?O_k+5RNp zq;8L}Pcw3ey4v&coZh9r`9CQ!^*N8^5cRd^c|AR$zWIM4%rAZa6j&8GfZrFw{4$<( zbpZb^g!yH>4JYVZ{4Qwx!LyDIygz{fJ0<9VUbQ&zIU0^AXY*BS8UD?!01K3}zgI2m zPh{@5kjIp>-&ZZ>XYS8I{!a5VEc$ZYhPKFSWyAHxens8_Pgl&ZcbXsQ(;s=A;mJ3| zx4_dC^YfkN2m16+UiYX^d<#5XF~8qwexOf(<#nI>#J9lH6&r`j-phaew((y6NAKx> z^q&4p6#UmMeUbaSV1L&S7`D%A{S%LXQSd+LbKe*2`}#Eo-wC8ftONC!Nc-0jN1O-L zKHG=HBVZKbZ}vB!k8^_BXS+tM1EZqbzfb$bd&EQH5ilyb{Rgy9tP*R)IxtGy{&U(V z-Xk6okATsD+kZj(#452ytQ+cgt-*moXof?_c|Vxoq=Bw_RK3cBMdjSl{XbOoxH_2vCYXzczccXY)y9 znOL-Sa959G7%$epdN{l4l@0^*5w43xpMRTu+Ozql_J~DWCuhR`r&~x~Ls%ro`ZyEz zKjS&x9P8su*#C^D-#OOD8J=(Q8vD<&K5DdKAoF;4?7>an|60!PlIQ-n=KLOcJI0Hw zJAaG3Iuou<=Wmne{&&OqE_v>MndhrxdD`^-ujTwM`7-RUIlo7K5d6CHx5y8i!$yWzP_VZpVW-zCoiTyuVp{HXiGdbvfu=KhesP5uM-hkTd( z8TaQ2{O_E=pR?dw_`aj(RPuH3YtHYH{}BAT^S8*4f!}ogHu;ah-*CQb{H)byQQG#_ zuYtMIGQC-=Pv!LhY1fwXp*}U$NAO;DoDb)3<^M>&<9yNhHixbDmxIQvo}Yq#dmL`b zmh%bibN=r*KS2BBJIf`j&UyW(;HdzG3>7e<|SmMr_|VOyBB2^$lbDzG3=S52|k%+xHFA ze_;_S;QPiyji0>W1I4z>3zTz#%EqAAc6m(s1o&(Uxa0Do z_iyt_`v-2Jt{&?54fap@2yolwiT6+W7_jZ~0q>vkDd3LF%ie#_;*ur6E!5{j8T9@s z9|3N=e8~H!d<@ujdByvudJD)} zv-!NN-6h3HR#(D7DH1);+6htrqo^(1+tEzBO|>2VBuv3F /dev/null -echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.4/ubuntu noble main" \ +echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/7.0/ubuntu noble main" \ | sudo tee /etc/apt/sources.list.d/amdgpu.list -echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 noble main" \ +echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0 noble main" \ | sudo tee --append /etc/apt/sources.list.d/rocm.list echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ | sudo tee /etc/apt/preferences.d/rocm-pin-600 @@ -86,8 +86,8 @@ sudo chmod 777 /usr/lib/firmware/amdgpu/ip_discovery.bin # Install a known-working version of Linux as this might change after stable -# release. Installl this after DKMS so they are rebuilt. -KERNEL=6.8.0-60-generic +# release. Install this after DKMS so they are rebuilt. +KERNEL=6.8.0-79-generic sudo apt -y install "linux-image-${KERNEL}" sudo apt -y install "linux-headers-${KERNEL}" "linux-modules-extra-${KERNEL}" @@ -105,6 +105,13 @@ if [ ! -f ./gem5_wmi.ko ]; then fi popd +# Make the discovery files writeable by packer +touch /usr/lib/firmware/amdgpu/mi300_discovery +touch /usr/lib/firmware/amdgpu/mi350_discovery + +chmod 777 /usr/lib/firmware/amdgpu/mi300_discovery +chmod 777 /usr/lib/firmware/amdgpu/mi350_discovery + # Note about pip: This disk is created for the express purpose of being run in # gem5 and is therefore effectively sandboxed enough that we can use the pip # option --break-system-packages. If you plan to modify this disk image with @@ -121,11 +128,9 @@ pip3 install --break-system-packages torch torchvision torchaudio --index-url ht # For a newer version uncomment one below and remove the above install: # Warning: Absurdly slow compared to ROCm 6.0 *in simulation*: -#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4 -# Warning: Missing python module torch.sparse.......: -#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3 +#pip3 install --break-system-packages torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4 # Warning: nightly build, may not work depending on day. Use at your own risk: -#pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4/ --break-system-packages +#pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0 --break-system-packages # Setup gem5 auto login. mv /home/gem5/serial-getty@.service /lib/systemd/system/ diff --git a/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl b/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl index a9e245318..7536373ee 100644 --- a/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl +++ b/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl @@ -26,6 +26,11 @@ variable "ssh_username" { default = "gem5" } +variable "qemu_path" { + type = string + default = "/usr/bin/qemu-system-x86_64" +} + source "qemu" "initialize" { accelerator = "kvm" boot_command = ["e", @@ -43,7 +48,7 @@ source "qemu" "initialize" { iso_urls = ["https://releases.ubuntu.com/24.04.2/ubuntu-24.04.2-live-server-amd64.iso"] memory = "8192" output_directory = "disk-image" - qemu_binary = "/usr/bin/qemu-system-x86_64" + qemu_binary = "${var.qemu_path}" qemuargs = [["-cpu", "host"], ["-display", "none"]] shutdown_command = "echo '${var.ssh_password}'|sudo -S shutdown -P now" ssh_password = "${var.ssh_password}" @@ -97,10 +102,15 @@ build { } provisioner "file" { - destination = "/usr/lib/firmware/amdgpu/ip_discovery.bin" + destination = "/usr/lib/firmware/amdgpu/mi300_discovery" source = "files/mi300_discovery" } + provisioner "file" { + destination = "/usr/lib/firmware/amdgpu/mi350_discovery" + source = "files/mi350_discovery" + } + provisioner "file" { source = "/home/gem5/vmlinux-gpu-ml" destination = "vmlinux-gpu-ml" From 2ef1d93a6dd3b649a4dc5a1e57880bbf513c45d3 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 26 Nov 2025 15:46:44 +0000 Subject: [PATCH 2/2] resources: Rename output to x86-ubuntu-rocm70 --- src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl b/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl index 7536373ee..4e7c29367 100644 --- a/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl +++ b/src/x86-ubuntu-gpu-ml/x86-ubuntu-gpu-ml.pkr.hcl @@ -13,7 +13,7 @@ packer { variable "image_name" { type = string - default = "x86-ubuntu-gpu-ml" + default = "x86-ubuntu-rocm70" } variable "ssh_password" { @@ -113,7 +113,7 @@ build { provisioner "file" { source = "/home/gem5/vmlinux-gpu-ml" - destination = "vmlinux-gpu-ml" + destination = "vmlinux-rocm70" direction = "download" } }