From 25d0c83e078706cc0a558277afcebfeb80c516c7 Mon Sep 17 00:00:00 2001
From: Jvst Me
Date: Tue, 28 Oct 2025 23:18:30 +0100
Subject: [PATCH] Drop hardcoded Hot Aisle VM specs

Use the spec object from gpuhunt offers instead. This allows newly
added instance types with different CPU, RAM, disk, and GPU count
configurations to automatically become available in dstack. However,
limit the supported GPUs to MI300X, since other GPUs and CPU-only VMs
might need to be tested by the dstack team before they become
available to users.
---
 pyproject.toml                        |   6 +-
 .../core/backends/hotaisle/compute.py | 108 ++++--------------
 2 files changed, 30 insertions(+), 84 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d4f6f1cb..ea0a5e1c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,8 @@ dependencies = [
     "python-multipart>=0.0.16",
     "filelock",
     "psutil",
-    "gpuhunt==0.1.11",
+    # TODO: release and pin new version
+    "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip",
     "argcomplete>=3.5.0",
     "ignore-python>=0.2.0",
     "orjson",
@@ -67,6 +68,9 @@ artifacts = [
     "src/dstack/_internal/server/statics/**",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true # TODO: unset
+
 [tool.hatch.metadata.hooks.fancy-pypi-readme]
 content-type = "text/markdown"
 
diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py
index 200173b1f..10013b22a 100644
--- a/src/dstack/_internal/core/backends/hotaisle/compute.py
+++ b/src/dstack/_internal/core/backends/hotaisle/compute.py
@@ -2,7 +2,7 @@
 import subprocess
 import tempfile
 from threading import Thread
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import gpuhunt
 from gpuhunt.providers.hotaisle import HotAisleProvider
@@ -22,6 +22,7 @@
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
+    InstanceOffer,
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
@@ -31,48 +32,7 @@
 
 logger = get_logger(__name__)
 
-INSTANCE_TYPE_SPECS = {
-    "1x MI300X 8x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "1x MI300X 13x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-}
+SUPPORTED_GPUS = ["MI300X"]
 
 
 class HotAisleCompute(
@@ -95,45 +55,15 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
             catalog=self.catalog,
+            extra_filter=_supported_instances,
         )
-        supported_offers = []
-        for offer in offers:
-            if offer.instance.name in INSTANCE_TYPE_SPECS:
-                supported_offers.append(
-                    InstanceOfferWithAvailability(
-                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
-                    )
-                )
-            else:
-                logger.warning(
-                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
-                )
-        return supported_offers
-
-    def get_payload_from_offer(self, instance_type) -> dict:
-        instance_type_name = instance_type.name
-        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
-        cpu_cores = instance_type.resources.cpus
-
-        return {
-            "cpu_cores": cpu_cores,
-            "cpus": {
-                "count": 1,
-                "manufacturer": cpu_specs["cpu_manufacturer"],
-                "model": cpu_specs["cpu_model"],
-                "cores": cpu_cores,
-                "frequency": cpu_specs["cpu_frequency"],
-            },
-            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
-            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
-            "gpus": [
-                {
-                    "count": len(instance_type.resources.gpus),
-                    "manufacturer": instance_type.resources.gpus[0].vendor,
-                    "model": instance_type.resources.gpus[0].name,
-                }
-            ],
-        }
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.AVAILABLE,
+            )
+            for offer in offers
+        ]
 
     def create_instance(
         self,
@@ -143,8 +73,10 @@
     ) -> JobProvisioningData:
         project_ssh_key = instance_config.ssh_keys[0]
         self.api_client.upload_ssh_key(project_ssh_key.public)
-        vm_payload = self.get_payload_from_offer(instance_offer.instance)
-        vm_data = self.api_client.create_virtual_machine(vm_payload)
+        offer_backend_data: HotAisleOfferBackendData = (
+            HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
+        )
+        vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs)
         return JobProvisioningData(
             backend=instance_offer.backend,
             instance_type=instance_offer.instance,
@@ -240,6 +172,12 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
     )
 
 
+def _supported_instances(offer: InstanceOffer) -> bool:
+    return len(offer.instance.resources.gpus) > 0 and all(
+        gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
+    )
+
+
 class HotAisleInstanceBackendData(CoreModel):
     ip_address: str
 
@@ -247,3 +185,7 @@
     def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
         assert raw is not None
         return cls.__response__.parse_raw(raw)
+
+
+class HotAisleOfferBackendData(CoreModel):
+    vm_specs: dict[str, Any]
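
Note: below is a minimal, self-contained sketch of the offer-filtering
semantics the patch introduces via _supported_instances. The Gpu, Resources,
Instance, and Offer dataclasses are hypothetical stand-ins for dstack's real
models, included only so the snippet runs on its own; the predicate body
itself is copied from the patch:

    from dataclasses import dataclass, field
    from typing import List

    SUPPORTED_GPUS = ["MI300X"]

    @dataclass
    class Gpu:  # stand-in for dstack's GPU model
        name: str

    @dataclass
    class Resources:  # stand-in exposing only the field the filter reads
        gpus: List[Gpu] = field(default_factory=list)

    @dataclass
    class Instance:  # stand-in for InstanceType
        resources: Resources

    @dataclass
    class Offer:  # stand-in for InstanceOffer
        instance: Instance

    def _supported_instances(offer: Offer) -> bool:
        # Reject CPU-only offers (no GPUs) and any offer containing a GPU
        # model outside the allowlist.
        return len(offer.instance.resources.gpus) > 0 and all(
            gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
        )

    # An 8x MI300X offer passes; CPU-only and non-MI300X offers are dropped.
    assert _supported_instances(Offer(Instance(Resources([Gpu("MI300X")] * 8))))
    assert not _supported_instances(Offer(Instance(Resources([]))))
    assert not _supported_instances(Offer(Instance(Resources([Gpu("H100")]))))

One side effect of moving the check into extra_filter: unsupported offers are
presumably dropped inside the catalog query rather than producing the
per-offer "Skipping unsupported Hot Aisle instance type" warning the old
loop logged.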