From 25d0c83e078706cc0a558277afcebfeb80c516c7 Mon Sep 17 00:00:00 2001
From: Jvst Me
Date: Tue, 28 Oct 2025 23:18:30 +0100
Subject: [PATCH] Drop hardcoded Hot Aisle VM specs

Use the spec object from gpuhunt offers instead. This allows newly
added instance types with different CPU, RAM, disk, and GPU count
configurations to automatically become available in dstack. However,
limit the supported GPUs to MI300X, since other GPUs and CPU-only VMs
might need to be tested by the dstack team before they become
available to users.
---
 pyproject.toml                        |   6 +-
 .../core/backends/hotaisle/compute.py | 108 ++++--------------
 2 files changed, 30 insertions(+), 84 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d4f6f1cb..ea0a5e1c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,8 @@ dependencies = [
     "python-multipart>=0.0.16",
     "filelock",
     "psutil",
-    "gpuhunt==0.1.11",
+    # TODO: release and pin new version
+    "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip",
     "argcomplete>=3.5.0",
     "ignore-python>=0.2.0",
     "orjson",
@@ -67,6 +68,9 @@ artifacts = [
     "src/dstack/_internal/server/statics/**",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true # TODO: unset
+
 [tool.hatch.metadata.hooks.fancy-pypi-readme]
 content-type = "text/markdown"
 
diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py
index 200173b1f..10013b22a 100644
--- a/src/dstack/_internal/core/backends/hotaisle/compute.py
+++ b/src/dstack/_internal/core/backends/hotaisle/compute.py
@@ -2,7 +2,7 @@
 import subprocess
 import tempfile
 from threading import Thread
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import gpuhunt
 from gpuhunt.providers.hotaisle import HotAisleProvider
@@ -22,6 +22,7 @@
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
+    InstanceOffer,
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
@@ -31,48 +32,7 @@
 
 logger = get_logger(__name__)
 
-INSTANCE_TYPE_SPECS = {
-    "1x MI300X 8x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "1x MI300X 13x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-}
+SUPPORTED_GPUS = ["MI300X"]
 
 
 class HotAisleCompute(
@@ -95,45 +55,15 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
             catalog=self.catalog,
+            extra_filter=_supported_instances,
         )
-        supported_offers = []
-        for offer in offers:
-            if offer.instance.name in INSTANCE_TYPE_SPECS:
-                supported_offers.append(
-                    InstanceOfferWithAvailability(
-                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
-                    )
-                )
-            else:
-                logger.warning(
-                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
-                )
-        return supported_offers
-
-    def get_payload_from_offer(self, instance_type) -> dict:
-        instance_type_name = instance_type.name
-        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
-        cpu_cores = instance_type.resources.cpus
-
-        return {
-            "cpu_cores": cpu_cores,
-            "cpus": {
-                "count": 1,
-                "manufacturer": cpu_specs["cpu_manufacturer"],
-                "model": cpu_specs["cpu_model"],
-                "cores": cpu_cores,
-                "frequency": cpu_specs["cpu_frequency"],
-            },
-            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
-            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
-            "gpus": [
-                {
-                    "count": len(instance_type.resources.gpus),
-                    "manufacturer": instance_type.resources.gpus[0].vendor,
-                    "model": instance_type.resources.gpus[0].name,
-                }
-            ],
-        }
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.AVAILABLE,
+            )
+            for offer in offers
+        ]
 
     def create_instance(
         self,
@@ -143,8 +73,10 @@
     ) -> JobProvisioningData:
         project_ssh_key = instance_config.ssh_keys[0]
         self.api_client.upload_ssh_key(project_ssh_key.public)
-        vm_payload = self.get_payload_from_offer(instance_offer.instance)
-        vm_data = self.api_client.create_virtual_machine(vm_payload)
+        offer_backend_data: HotAisleOfferBackendData = (
+            HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
+        )
+        vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs)
         return JobProvisioningData(
             backend=instance_offer.backend,
             instance_type=instance_offer.instance,
@@ -240,6 +172,12 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
     )
 
 
+def _supported_instances(offer: InstanceOffer) -> bool:
+    return len(offer.instance.resources.gpus) > 0 and all(
+        gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
+    )
+
+
 class HotAisleInstanceBackendData(CoreModel):
     ip_address: str
 
@@ -247,3 +185,7 @@
     def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
         assert raw is not None
         return cls.__response__.parse_raw(raw)
+
+
+class HotAisleOfferBackendData(CoreModel):
+    vm_specs: dict[str, Any]
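
Note: below is a minimal, self-contained sketch of the offer-filtering
semantics the patch introduces via _supported_instances. The Gpu, Resources,
Instance, and Offer dataclasses are hypothetical stand-ins for dstack's real
models, included only so the snippet runs on its own; the predicate body
itself is copied from the patch:

    from dataclasses import dataclass, field
    from typing import List

    SUPPORTED_GPUS = ["MI300X"]

    @dataclass
    class Gpu:  # stand-in for dstack's GPU model
        name: str

    @dataclass
    class Resources:  # stand-in exposing only the field the filter reads
        gpus: List[Gpu] = field(default_factory=list)

    @dataclass
    class Instance:  # stand-in for InstanceType
        resources: Resources

    @dataclass
    class Offer:  # stand-in for InstanceOffer
        instance: Instance

    def _supported_instances(offer: Offer) -> bool:
        # Reject CPU-only offers (no GPUs) and any offer containing a GPU
        # model outside the allowlist.
        return len(offer.instance.resources.gpus) > 0 and all(
            gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
        )

    # An 8x MI300X offer passes; CPU-only and non-MI300X offers are dropped.
    assert _supported_instances(Offer(Instance(Resources([Gpu("MI300X")] * 8))))
    assert not _supported_instances(Offer(Instance(Resources([]))))
    assert not _supported_instances(Offer(Instance(Resources([Gpu("H100")]))))

One side effect of moving the check into extra_filter: unsupported offers are
presumably dropped inside the catalog query rather than producing the
per-offer "Skipping unsupported Hot Aisle instance type" warning the old
loop logged.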