
Commit 326fa96

Fix col start
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
1 parent: 53bea8c

7 files changed: +47 lines, -35 lines


csrc/quantization/fp4/nvfp4_quant_kernels.cu (10 additions, 10 deletions)

@@ -43,17 +43,17 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    int sf_m = round_up(numRows, 128);
-    int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
-    int sf_n = round_up(sf_n_unpadded, 4) / 4;
-    for(int row = numRows; row < sf_m; row += 1) {
-      for(int col = sf_n_unpadded; col < sf_n; col +=1) {
-        SFout[row * sf_n + col] = 0x00;
-      }
+  int sf_m = round_up(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_uint32 = round_up(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_uint32;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_uint32 + col] = 0x00000000;
     }
-    }
-  }
+  }
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
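
For reference, the padded scale-factor extents used in the hunk above come down to a few lines of integer arithmetic. The sketch below is illustrative only: it assumes CVT_FP4_SF_VEC_SIZE is 16 (one FP8 scale per 16 FP4 elements), and the Python helper names are not taken from the repository.

# Illustrative Python sketch of the scale-factor extents computed above.
# Assumption: CVT_FP4_SF_VEC_SIZE == 16 (one FP8 scale per 16 FP4 values).
def round_up(x: int, multiple: int) -> int:
    # Mirrors what round_up(x, m) presumably computes: x rounded up to a multiple of m.
    return (x + multiple - 1) // multiple * multiple

def padded_sf_extents(num_rows: int, num_cols: int, sf_vec_size: int = 16):
    sf_m = round_up(num_rows, 128)                 # row count padded to a multiple of 128
    sf_n_unpadded = num_cols // sf_vec_size        # scales per row before padding
    sf_n_uint32 = round_up(sf_n_unpadded, 4) // 4  # scales packed four per uint32
    return sf_m, sf_n_unpadded, sf_n_uint32

print(padded_sf_extents(100, 256))  # (128, 16, 4)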

tests/kernels/quantization/test_nvfp4_quant.py (0 additions, 4 deletions)

@@ -170,9 +170,5 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     out, out_scale = ops.scaled_fp4_quant(x, global_scale)
     scale_ans = recover_swizzled_scales(out_scale, m, n)
     out_ans = cast_from_fp4(out, m, n)
-    print(f"out_ans: {out_ans}")
-    print(f"out_ref: {out_ref}")
-    print(f"scale_ans: {scale_ans}")
-    print(f"scale_ref: {scale_ref}")
     torch.testing.assert_close(out_ans, out_ref)
     torch.testing.assert_close(scale_ans, scale_ref)

vllm/envs.py (2 additions, 2 deletions)

@@ -1208,7 +1208,7 @@ def get_vllm_port() -> int | None:
     # - "latency":
     #   Uses TensorRT-LLM kernels optimized for low-latency inference.
     "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput", ["throughput", "latency"]
+        "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"]
     ),
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
@@ -1313,7 +1313,7 @@ def get_vllm_port() -> int | None:
     "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
         "VLLM_NVFP4_GEMM_BACKEND",
         None,
-        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass"],
+        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"],
     ),
     # Controls garbage collection during CUDA graph capture.
     # If set to 0 (default), enables GC freezing to speed up capture time.
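
Both environment-variable changes above can be exercised directly from user code. A minimal usage sketch, assuming a vLLM build that contains this commit; the model name below is a placeholder, not something referenced in the diff.

import os

# Select the newly added "cutlass" NVFP4 GEMM backend and make the (now
# default) "latency" FlashInfer MoE backend explicit. Setting these before
# importing vllm ensures envs.py sees them.
os.environ["VLLM_NVFP4_GEMM_BACKEND"] = "cutlass"
os.environ["VLLM_FLASHINFER_MOE_BACKEND"] = "latency"

from vllm import LLM  # noqa: E402

llm = LLM(model="nvfp4-quantized-model")  # placeholder model id
print(llm.generate("Hello"))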

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py (3 additions, 0 deletions)

@@ -50,6 +50,9 @@ def __init__(self):
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), "Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(

vllm/model_executor/layers/quantization/fp8.py (6 additions, 1 deletion)

@@ -139,7 +139,12 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
             return Fp8MoeBackend.FLASHINFER_TRTLLM
         else:
             if block_quant:
-                raise ValueError("FlashInfer FP8 MoE CUTLASS backend does not support block quantization")
+                raise ValueError(
+                    "FlashInfer FP8 MoE throughput backend does not "
+                    "support block quantization. Please use "
+                    "VLLM_FLASHINFER_MOE_BACKEND=latency "
+                    "instead."
+                )
             logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100")
             return Fp8MoeBackend.FLASHINFER_CUTLASS

vllm/model_executor/layers/quantization/modelopt.py (13 additions, 4 deletions)

@@ -224,7 +224,10 @@ def is_layer_excluded(self, prefix: str) -> bool:
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
 
         if isinstance(layer, LinearBase):
             if self.is_layer_excluded(prefix):
@@ -233,7 +236,7 @@ def get_quant_method(
             if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
            return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             return ModelOptFp8MoEMethod(self, layer)
@@ -905,7 +908,10 @@ def is_layer_excluded(self, prefix: str) -> bool:
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
+        from vllm.attention.layer import (  # Avoid circular import
+            Attention,
+            MLAAttention,
+        )
 
         skip_layer = self.is_layer_excluded(prefix)
         if isinstance(layer, LinearBase):
@@ -915,7 +921,7 @@ def get_quant_method(
             if "vision_tower" in prefix or "vision_model" in prefix:
                 return UnquantizedLinearMethod()
             return ModelOptNvFp4LinearMethod(self)
-        elif isinstance(layer, Attention):
+        elif isinstance(layer, (Attention, MLAAttention)):
             return ModelOptFp8KVCacheMethod(self)
         elif isinstance(layer, FusedMoE):
             if skip_layer:
@@ -958,6 +964,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), "Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(

vllm/model_executor/model_loader/weight_utils.py (13 additions, 14 deletions)

@@ -1034,14 +1034,14 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
             return None
         return remapped_name
 
-    # if any("mla_attn" in key for key in params_dict):
-    #     attn_str = "mla_attn.mla_attn"
-    #     logger.debug_once(
-    #         f"Found mla_attn with k_scale and v_scale in "
-    #         f"the checkpoint, using {attn_str} as attn_str"
-    #     )
-    # else:
-    attn_str = "attn"
+    if any("mla_attn" in key for key in params_dict):
+        attn_str = "mla_attn.mla_attn"
+        logger.debug_once(
+            f"Found mla_attn with k_scale and v_scale in "
+            f"the checkpoint, using {attn_str} as attn_str"
+        )
+    else:
+        attn_str = "attn"
     # Define scale name mapping patterns in order of precedence
     scale_mapping_patterns = [
         # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale ->
@@ -1068,14 +1068,13 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         if re.search(pattern, name):
            remapped_name = re.sub(pattern, replacement, name)
            if remapped_name not in params_dict:
-                # find the scale type in params_dict
-                params_scale_name = "<not found>"
                scale_type = name.split(".")[-1]
-                print(params_dict.keys())
                logger.warning_once(
-                    f"Found {scale_type} in the checkpoint (e.g. {name}), but not found the remapped name in the model "
-                    f" (e.g. {remapped_name}). {scale_type} is not loaded."
-                    # f"Expected format is {params_scale_name} ", # noqa: E501
+                    "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.",  # noqa: E501
+                    scale_type,
+                    name,
+                    remapped_name,
+                    scale_type,
                )
                return None
            return remapped_name
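
The attn_str selection that the last file re-enables is self-contained enough to show on its own. Below is a standalone sketch for illustration only; the example parameter names are invented, not taken from a real checkpoint.

# Standalone sketch of the re-enabled attn_str selection above.
def pick_attn_str(params_dict: dict) -> str:
    # Models whose parameter names contain "mla_attn" get the doubled
    # "mla_attn.mla_attn" prefix when remapping k_scale / v_scale names;
    # every other model keeps the plain "attn" prefix.
    if any("mla_attn" in key for key in params_dict):
        return "mla_attn.mla_attn"
    return "attn"

# Invented parameter names, purely for illustration:
print(pick_attn_str({"layers.0.self_attn.mla_attn.mla_attn.k_scale": None}))  # -> mla_attn.mla_attn
print(pick_attn_str({"layers.0.self_attn.attn.k_scale": None}))               # -> attn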
