Parallelize NVFP4 scale-factor padding zero-fill and fix Codex review comment

pavanimajety · pavanimajety · commit 1064bed05eba · 2025-10-27T15:14:47.000-07:00
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -43,14 +43,14 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    int sf_m = round_up(numRows, 128);
-    int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
-    int sf_n = round_up(sf_n_unpadded, 4) / 4;
-    for (int row = numRows; row < sf_m; row += 1) {
-      for (int col = sf_n_unpadded; col < sf_n; col += 1) {
-        SFout[row * sf_n + col] = 0x00;
-      }
+  int sf_m = round_up(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_uint32 = round_up(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    for (int col = (sf_n_unpadded + 3) / 4 + threadIdx.x; 
+         col < sf_n_uint32; 
+         col += blockDim.x) {
+      SFout[row * sf_n_uint32 + col] = 0x00000000;
     }
   }
 
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -1176,7 +1176,7 @@ def get_vllm_port() -> int | None:
     # - "latency":
     #     Uses TensorRT-LLM kernels optimized for low-latency inference.
     "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
-        "VLLM_FLASHINFER_MOE_BACKEND", "throughput", ["throughput", "latency"]
+        "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"]
     ),
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for