
Commit 53bea8c

Misc fixes for FP4 MOE and Quant
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
1 parent 3758757 commit 53bea8c

6 files changed: +36 −19 lines changed


csrc/quantization/fp4/nvfp4_quant_kernels.cu

Lines changed: 14 additions & 2 deletions
@@ -31,6 +31,7 @@
 
 namespace vllm {
 
+#define round_up(x, y) ((x + y - 1) / y * y)
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +43,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    int sf_m = round_up(numRows, 128);
+    int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+    int sf_n = round_up(sf_n_unpadded, 4) / 4;
+    for (int row = numRows; row < sf_m; row += 1) {
+      for (int col = sf_n_unpadded; col < sf_n; col += 1) {
+        SFout[row * sf_n + col] = 0x00;
+      }
+    }
+  }
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +76,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                                       rowIdx, colIdx, numCols, SFout);
 
       out_pos =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
     }
   }
 }
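The new thread-0 loop zero-fills the rows of the swizzled scale-factor buffer that exist only because of padding (rows are rounded up to a multiple of 128), so the host no longer has to pre-zero the buffer. As a rough illustration, here is a minimal Python sketch (not part of the commit) of the same round_up arithmetic; it assumes a scale-factor group size (CVT_FP4_SF_VEC_SIZE) of 16, which this diff does not show.

# Hedged sketch, not part of the commit: host-side view of the padding
# arithmetic used by the kernel's new zero-fill loop.
# ASSUMPTION: CVT_FP4_SF_VEC_SIZE == 16 (one FP8 scale per 16 FP4 values).
def round_up(x: int, y: int) -> int:
    return (x + y - 1) // y * y

num_rows, num_cols = 100, 256
sf_vec_size = 16

sf_m = round_up(num_rows, 128)           # scale rows, padded to a multiple of 128
sf_n_unpadded = num_cols // sf_vec_size  # scale columns actually written by the quant loop
print(sf_m, sf_n_unpadded)               # 128 16 -> rows 100..127 are padding and get zeroed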

tests/kernels/quantization/test_nvfp4_quant.py

Lines changed: 4 additions & 2 deletions
@@ -168,9 +168,11 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
 
     out, out_scale = ops.scaled_fp4_quant(x, global_scale)
-
     scale_ans = recover_swizzled_scales(out_scale, m, n)
     out_ans = cast_from_fp4(out, m, n)
-
+    print(f"out_ans: {out_ans}")
+    print(f"out_ref: {out_ref}")
+    print(f"scale_ans: {scale_ans}")
+    print(f"scale_ref: {scale_ref}")
     torch.testing.assert_close(out_ans, out_ref)
     torch.testing.assert_close(scale_ans, scale_ref)

vllm/_custom_ops.py

Lines changed: 1 addition & 1 deletion
@@ -1381,7 +1381,7 @@ def scaled_fp4_quant(
     rounded_m = round_up(m, 128)
     scale_n = n // block_size
     rounded_n = round_up(scale_n, 4)
-    output_scale = torch.zeros(
+    output_scale = torch.empty(
         (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
     )
 
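Switching the allocation from torch.zeros to torch.empty removes a full memset of the scale buffer on every call; the padded region is now expected to be cleared by the kernel change above. A minimal sketch of the shape arithmetic follows, assuming block_size is 16 (its value is not shown in this hunk) and using a CPU tensor purely for illustration.

import torch

def round_up(x: int, y: int) -> int:
    return (x + y - 1) // y * y

# Hedged sketch of the swizzled-scale allocation in scaled_fp4_quant.
# ASSUMPTION: block_size == 16; the real buffer lives on the input's device.
m, n, block_size = 100, 256, 16
rounded_m = round_up(m, 128)          # 128
scale_n = n // block_size             # 16
rounded_n = round_up(scale_n, 4)      # 16
output_scale = torch.empty((rounded_m, rounded_n // 4), dtype=torch.int32)
print(output_scale.shape)             # torch.Size([128, 4])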

vllm/envs.py

Lines changed: 1 addition & 1 deletion
@@ -154,7 +154,7 @@
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
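This flips the default FlashInfer MoE backend from "throughput" to "latency". A hedged example of restoring the previous behavior by setting the environment variable before vLLM reads it:

import os

# Hedged sketch: select the FlashInfer MoE backend explicitly instead of
# relying on the new "latency" default. Set this before vLLM reads its envs.
os.environ["VLLM_FLASHINFER_MOE_BACKEND"] = "throughput"  # or "latency"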

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 2 additions & 0 deletions
@@ -138,6 +138,8 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
             logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
             return Fp8MoeBackend.FLASHINFER_TRTLLM
         else:
+            if block_quant:
+                raise ValueError("FlashInfer FP8 MoE CUTLASS backend does not support block quantization")
             logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100")
             return Fp8MoeBackend.FLASHINFER_CUTLASS
 
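The new guard makes block-quantized FP8 MoE fail fast when the FlashInfer CUTLASS path is selected, instead of proceeding with an unsupported configuration. A hedged usage sketch; whether this branch is reached depends on the platform (SM100) and the FlashInfer environment variables, so the call below only raises in that configuration.

from vllm.model_executor.layers.quantization.fp8 import get_fp8_moe_backend

# Hedged sketch: on a setup that selects the FlashInfer CUTLASS FP8 MoE
# backend, block quantization is now rejected up front.
try:
    backend = get_fp8_moe_backend(block_quant=True)
except ValueError as err:
    print(err)  # FlashInfer FP8 MoE CUTLASS backend does not support block quantization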

vllm/model_executor/model_loader/weight_utils.py

Lines changed: 14 additions & 13 deletions
@@ -1034,14 +1034,14 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
             return None
         return remapped_name
 
-    if any("mla_attn" in key for key in params_dict):
-        attn_str = "mla_attn.mla_attn"
-        logger.debug_once(
-            f"Found mla_attn with k_scale and v_scale in "
-            f"the checkpoint, using {attn_str} as attn_str"
-        )
-    else:
-        attn_str = "attn"
+    # if any("mla_attn" in key for key in params_dict):
+    #     attn_str = "mla_attn.mla_attn"
+    #     logger.debug_once(
+    #         f"Found mla_attn with k_scale and v_scale in "
+    #         f"the checkpoint, using {attn_str} as attn_str"
+    #     )
+    # else:
+    attn_str = "attn"
     # Define scale name mapping patterns in order of precedence
     scale_mapping_patterns = [
         # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale ->
@@ -1068,13 +1068,14 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         if re.search(pattern, name):
             remapped_name = re.sub(pattern, replacement, name)
             if remapped_name not in params_dict:
+                # find the scale type in params_dict
+                params_scale_name = "<not found>"
                 scale_type = name.split(".")[-1]
+                print(params_dict.keys())
                 logger.warning_once(
-                    "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.",  # noqa: E501
-                    scale_type,
-                    name,
-                    remapped_name,
-                    scale_type,
+                    f"Found {scale_type} in the checkpoint (e.g. {name}), but not found the remapped name in the model "
+                    f" (e.g. {remapped_name}). {scale_type} is not loaded."
+                    # f"Expected format is {params_scale_name} ",  # noqa: E501
                 )
                 return None
             return remapped_name
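For context, maybe_remap_kv_scale_name tries each (pattern, replacement) pair and only accepts a remap whose target exists in the model's params_dict; otherwise it warns and returns None so the scale is skipped. A minimal sketch of that step, using a hypothetical pattern rather than one of vLLM's actual scale_mapping_patterns:

import re

# Hedged sketch: the pattern, replacement, name and params_dict below are
# hypothetical examples, not values taken from vLLM.
pattern = r"\.self_attn\.k_proj\.k_scale$"
replacement = ".self_attn.attn.k_scale"
name = "model.layers.0.self_attn.k_proj.k_scale"
params_dict = {"model.layers.0.self_attn.attn.k_scale": None}

if re.search(pattern, name):
    remapped_name = re.sub(pattern, replacement, name)
    # Only accept the remap if the target parameter actually exists.
    result = remapped_name if remapped_name in params_dict else None
    print(result)  # model.layers.0.self_attn.attn.k_scale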
