Misc fixes again: accept "cutlass" for VLLM_NVFP4_GEMM_BACKEND, reformat NVFP4 quant kernel loop

pavanimajety · pavanimajety · commit 1fb4b91d1957 · 2025-10-27T15:30:11.000-07:00
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -47,8 +47,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
   int sf_n_uint32 = round_up(sf_n_unpadded, 4) / 4;
   for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
-    for (int col = (sf_n_unpadded + 3) / 4 + threadIdx.x; 
-         col < sf_n_uint32; 
+    for (int col = (sf_n_unpadded + 3) / 4 + threadIdx.x; col < sf_n_uint32;
          col += blockDim.x) {
       SFout[row * sf_n_uint32 + col] = 0x00000000;
     }
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -1281,7 +1281,7 @@ def get_vllm_port() -> int | None:
     "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
         "VLLM_NVFP4_GEMM_BACKEND",
         None,
-        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass"],
+        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"],
     ),
     # Controls garbage collection during CUDA graph capture.
     # If set to 0 (default), enables GC freezing to speed up capture time.
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -50,6 +50,9 @@ def __init__(self):
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
@@ -964,6 +964,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
         elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
             self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
             assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
+            self.backend = "cutlass"
+            assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
 
         if self.backend == "none":
             raise ValueError(