
Commit 4f8f47e

jasl and ProExpertProg authored
Fix undefined symbol: cutlass_moe_mm_sm100 (vllm-project#26098)
Signed-off-by: Jun Jiang <jasl9187@hotmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
1 parent d78fda7 commit 4f8f47e

File tree

3 files changed, +23 -3 lines changed


CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
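
When building with CUDA 13.0 or newer, the architecture list used here now includes the 11.0 family alongside 10.0, so the corresponding SM100-family kernels are also compiled for 11.0-class devices. As a rough illustration of why that matters (this is not the CMake helper itself; `covered_by_family` and `built_families` are hypothetical names, and the assumption that a family arch such as "10.0f" covers every minor revision of that major capability is ours):

import torch

def covered_by_family(capability: tuple[int, int], built_families: set[int]) -> bool:
    # Sketch assumption: a family arch like "10.0f" or "11.0f" covers
    # every minor revision of that major compute capability.
    major, _minor = capability
    return major in built_families

if torch.cuda.is_available():
    cap = torch.cuda.get_device_capability()    # e.g. (11, 0) on a Thor-class device
    before = covered_by_family(cap, {10})       # arch list was "10.0f"
    after = covered_by_family(cap, {10, 11})    # arch list is now "10.0f;11.0f"
    print(cap, "before:", before, "after:", after)

Under that assumption, an (11, 0) device was previously outside the built families, which is consistent with the missing-symbol failure this commit addresses.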

csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu

Lines changed: 2 additions & 2 deletions

@@ -254,15 +254,15 @@ void cutlass_moe_mm(
     bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100) {
+  if (version_num >= 100 && version_num < 110) {
     cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                          expert_offsets, problem_sizes, a_strides, b_strides,
                          c_strides, per_act_token, per_out_ch);
     return;
   }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90) {
+  if (version_num >= 90 && version_num < 100) {
     cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
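
The dispatch is now bounded from above as well as below, so a device newer than SM 10.x no longer falls into the SM100 branch and call a kernel that was never built for it. A minimal Python sketch of this bounded dispatch pattern (the function names and flags are placeholders standing in for the CUTLASS kernels, not vLLM APIs):

def dispatch_moe_mm(version_num: int, sm100_built: bool, sm90_built: bool) -> str:
    # Route 10.x devices to the SM100 kernel only; an 11.x or 12.x device
    # would otherwise be sent to a symbol that may not have been compiled in.
    if sm100_built and 100 <= version_num < 110:
        return "cutlass_moe_mm_sm100"
    if sm90_built and 90 <= version_num < 100:
        return "cutlass_moe_mm_sm90"
    raise RuntimeError(f"No compiled cutlass_moe_mm kernel for SM version {version_num}")

assert dispatch_moe_mm(90, True, True) == "cutlass_moe_mm_sm90"
assert dispatch_moe_mm(100, True, True) == "cutlass_moe_mm_sm100"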

vllm/utils/__init__.py

Lines changed: 20 additions & 0 deletions

@@ -2747,6 +2747,8 @@ def __post_init__(self):
         self.measure()
 
     def measure(self):
+        from vllm.platforms import current_platform
+
         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
@@ -2756,6 +2758,24 @@ def measure(self):
             "allocated_bytes.all.peak", 0)
 
         self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        shared_sysmem_device_mem_sms = (
+            (8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+        if current_platform.is_cuda() and \
+                current_platform.get_device_capability() in \
+                shared_sysmem_device_mem_sms:
+            # On UMA platforms (Orin, Thor and Spark),
+            # where both CPU and GPU rely on system memory,
+            # the cudaMemGetInfo function shows the amount of free system memory
+            # rather than what is actually available.
+            # In that case,
+            # torch.cuda.mem_get_info() only reports "free" memory,
+            # which can be lower than what is actually
+            # available because it does not include cache memory.
+            # A comprehensive reference page explains how to compute
+            # the proper value yourself:
+            # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
+            self.free_memory = psutil.virtual_memory().available
+
         self.cuda_memory = self.total_memory - self.free_memory
 
         # torch.cuda.memory_reserved() is how many bytes
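
On the listed integrated (UMA) capabilities the patch replaces the free-memory value from torch.cuda.mem_get_info() with psutil.virtual_memory().available. A small standalone sketch of that decision, assuming psutil is installed and a CUDA device is present (the helper name is illustrative, not part of vLLM):

import psutil
import torch

def free_memory_bytes() -> int:
    free, _total = torch.cuda.mem_get_info()
    capability = torch.cuda.get_device_capability()
    # Capabilities the patch treats as shared system/device memory platforms.
    uma_capabilities = {(8, 7), (11, 0), (12, 1)}  # Orin, Thor, Spark
    if capability in uma_capabilities:
        # cudaMemGetInfo excludes reclaimable cache on integrated GPUs,
        # so the OS view of available system memory is the better estimate.
        return psutil.virtual_memory().available
    return free

print(free_memory_bytes())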
