Commit cd262df

clean up comments and prints
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
1 parent 622b68a · commit cd262df

File tree: 3 files changed, +19 -17 lines

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 4 additions & 1 deletion

@@ -139,7 +139,10 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
         return Fp8MoeBackend.FLASHINFER_TRTLLM
     else:
         if block_quant:
-            raise ValueError("FlashInfer FP8 MoE CUTLASS backend does not support block quantization")
+            raise ValueError("FlashInfer FP8 MoE throughput backend does not "
+                             "support block quantization. Please use "
+                             "VLLM_FLASHINFER_MOE_BACKEND=latency "
+                             "instead.")
         logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100")
         return Fp8MoeBackend.FLASHINFER_CUTLASS
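
The new error message points users at the VLLM_FLASHINFER_MOE_BACKEND environment variable. As a rough illustration only (the model id is made up, and the exact backend-selection behaviour is whatever vLLM implements), switching a block-quantized FP8 MoE deployment to the latency backend could look like this:

import os

# Select the FlashInfer FP8 MoE latency backend, as the error message above
# suggests. Setting the variable before vLLM is imported keeps the choice
# visible to whichever code path reads it.
os.environ["VLLM_FLASHINFER_MOE_BACKEND"] = "latency"

from vllm import LLM  # noqa: E402

# Hypothetical block-quantized FP8 MoE model id, used purely for illustration.
llm = LLM(model="my-org/fp8-block-quant-moe-model")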

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 2 additions & 2 deletions

@@ -224,7 +224,7 @@ def is_layer_excluded(self, prefix: str) -> bool:
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+        from vllm.attention.layer import Attention, MLAAttention  # Avoid circular import

         if isinstance(layer, LinearBase):
             if self.is_layer_excluded(prefix):
@@ -233,7 +233,7 @@ def get_quant_method(
             if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, Attention) or isinstance(layer, MLAAttention):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelOptFp8MoEMethod(self, layer)
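
The widened isinstance check above says that MLA attention layers receive the same FP8 KV-cache quantization method as regular attention layers. A minimal sketch of an equivalent predicate, assuming only the two classes imported in the diff (the helper name is invented for illustration):

from vllm.attention.layer import Attention, MLAAttention


def uses_fp8_kv_cache_method(layer) -> bool:
    # isinstance accepts a tuple of types, so this single check behaves the
    # same as the chained "or" in the diff above.
    return isinstance(layer, (Attention, MLAAttention))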

vllm/model_executor/model_loader/weight_utils.py

Lines changed: 13 additions & 14 deletions

@@ -1021,14 +1021,14 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
             return None
         return remapped_name

-    # if any("mla_attn" in key for key in params_dict):
-    #     attn_str = "mla_attn.mla_attn"
-    #     logger.debug_once(
-    #         f"Found mla_attn with k_scale and v_scale in "
-    #         f"the checkpoint, using {attn_str} as attn_str"
-    #     )
-    # else:
-    attn_str = "attn"
+    if any("mla_attn" in key for key in params_dict):
+        attn_str = "mla_attn.mla_attn"
+        logger.debug_once(
+            f"Found mla_attn with k_scale and v_scale in "
+            f"the checkpoint, using {attn_str} as attn_str"
+        )
+    else:
+        attn_str = "attn"
     # Define scale name mapping patterns in order of precedence
     scale_mapping_patterns = [
         # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale ->

@@ -1055,14 +1055,13 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         if re.search(pattern, name):
            remapped_name = re.sub(pattern, replacement, name)
            if remapped_name not in params_dict:
-                # find the scale type in params_dict
-                params_scale_name = "<not found>"
                scale_type = name.split(".")[-1]
-                print(params_dict.keys())
                logger.warning_once(
-                    f"Found {scale_type} in the checkpoint (e.g. {name}), but not found the remapped name in the model "
-                    f" (e.g. {remapped_name}). {scale_type} is not loaded."
-                    # f"Expected format is {params_scale_name} ",  # noqa: E501
+                    "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.",  # noqa: E501
+                    scale_type,
+                    name,
+                    remapped_name,
+                    scale_type,
                )
                return None
            return remapped_name
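
For readers unfamiliar with maybe_remap_kv_scale_name, the re-enabled branch above only changes which module path the checkpoint's k_scale/v_scale names get mapped onto. The toy sketch below is not vLLM code: the regex and parameter names are simplified assumptions, meant only to show why an MLA checkpoint needs the nested mla_attn.mla_attn path.

import re


def remap_kv_scale(name: str, params_dict: dict) -> str | None:
    # Pick the attention module path the same way the diff above does:
    # MLA models expose their scales under a nested "mla_attn.mla_attn".
    attn_str = (
        "mla_attn.mla_attn"
        if any("mla_attn" in key for key in params_dict)
        else "attn"
    )
    # Simplified stand-in for the real scale_mapping_patterns.
    pattern = r"\.self_attn\.(k|v)_scale$"
    replacement = rf".self_attn.{attn_str}.\1_scale"
    if re.search(pattern, name):
        remapped = re.sub(pattern, replacement, name)
        return remapped if remapped in params_dict else None
    return name


params = {"model.layers.0.self_attn.mla_attn.mla_attn.k_scale": None}
print(remap_kv_scale("model.layers.0.self_attn.k_scale", params))
# model.layers.0.self_attn.mla_attn.mla_attn.k_scale

In the real helper the patterns also cover ModelOpt-style ..._proj.{k,v}_scale names, which is why it keeps an ordered list of (pattern, replacement) tuples rather than a single regex.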
