@@ -2747,6 +2747,8 @@ def __post_init__(self):
         self.measure()
 
     def measure(self):
+        from vllm.platforms import current_platform
+
         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.cuda.memory_reserved()`.
         # After `torch.cuda.reset_peak_memory_stats()`,
@@ -2756,6 +2758,24 @@ def measure(self):
             "allocated_bytes.all.peak", 0)
 
         self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        shared_sysmem_device_mem_sms = (
+            (8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+        if current_platform.is_cuda() and \
+                current_platform.get_device_capability() in \
+                shared_sysmem_device_mem_sms:
+            # On UMA platforms (Orin, Thor, and Spark),
+            # the CPU and GPU share system memory,
+            # so cudaMemGetInfo reports the amount of free
+            # system memory rather than free device memory.
+            # torch.cuda.mem_get_info() therefore returns a
+            # "free" figure that understates what is actually
+            # allocatable, since it excludes reclaimable caches.
+            # Use psutil's "available" figure instead.
+            # NVIDIA's Tegra app note explains how to estimate
+            # total allocatable device memory on integrated GPUs:
+            # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
+            self.free_memory = psutil.virtual_memory().available
+
         self.cuda_memory = self.total_memory - self.free_memory
 
         # torch.cuda.memory_reserved() is how many bytes
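
For context, here is a minimal standalone sketch of the same UMA adjustment outside of vLLM, using torch and psutil directly instead of the current_platform helper. The get_free_memory helper and the script structure are illustrative assumptions, not part of this commit; the capability tuples mirror the ones added above.

    import psutil
    import torch

    # Compute capabilities whose GPUs share system memory with the CPU
    # (UMA): Orin (8.7), Thor (11.0), Spark (12.1) -- assumed here to
    # match the tuple introduced in this commit.
    SHARED_SYSMEM_DEVICE_MEM_SMS = ((8, 7), (11, 0), (12, 1))

    def get_free_memory() -> int:
        # Hypothetical helper, not vLLM API: estimate allocatable
        # device memory in bytes, correcting for UMA devices.
        free_bytes, _total_bytes = torch.cuda.mem_get_info()
        if torch.cuda.get_device_capability() in SHARED_SYSMEM_DEVICE_MEM_SMS:
            # On UMA devices, cudaMemGetInfo's "free" excludes memory
            # held in reclaimable caches; psutil's "available" is a
            # closer estimate of what can actually be allocated.
            return psutil.virtual_memory().available
        return free_bytes

    if __name__ == "__main__":
        if torch.cuda.is_available():
            print(f"free memory estimate: {get_free_memory() / 1e9:.2f} GB")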