Skip to content

Commit 1fb4b91

Browse files
committed
Misc fixes again
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
1 parent 1064bed commit 1fb4b91

File tree

4 files changed

+8
-3
lines changed

4 files changed

+8
-3
lines changed

csrc/quantization/fp4/nvfp4_quant_kernels.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
4747
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
4848
int sf_n_uint32 = round_up(sf_n_unpadded, 4) / 4;
4949
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
50-
for (int col = (sf_n_unpadded + 3) / 4 + threadIdx.x;
51-
col < sf_n_uint32;
50+
for (int col = (sf_n_unpadded + 3) / 4 + threadIdx.x; col < sf_n_uint32;
5251
col += blockDim.x) {
5352
SFout[row * sf_n_uint32 + col] = 0x00000000;
5453
}

vllm/envs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1281,7 +1281,7 @@ def get_vllm_port() -> int | None:
12811281
"VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
12821282
"VLLM_NVFP4_GEMM_BACKEND",
12831283
None,
1284-
["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass"],
1284+
["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"],
12851285
),
12861286
# Controls garbage collection during CUDA graph capture.
12871287
# If set to 0 (default), enables GC freezing to speed up capture time.

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ def __init__(self):
5050
elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
5151
self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
5252
assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
53+
elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
54+
self.backend = "cutlass"
55+
assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
5356

5457
if self.backend == "none":
5558
raise ValueError(

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
964964
elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
965965
self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
966966
assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
967+
elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass":
968+
self.backend = "cutlass"
969+
assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}"
967970

968971
if self.backend == "none":
969972
raise ValueError(

0 commit comments

Comments
 (0)