
Commit 450bf90

Revert "[API change] deprecate tile_token_dim in trtllm_moe (#2086)"
This reverts commit 9a79b78.
Parent: 049e8db

10 files changed (+91, -8 lines)


benchmarks/README.md

Lines changed: 2 additions & 1 deletion

@@ -166,7 +166,8 @@ The output CSV will contain detailed metrics including:
 | `--topk_group` | Number of groups to consider for top-k routing. Default: 1 |
 | `--routed_scaling_factor`| Scaling factor for routing. Default: 2.5 |
 | `--local_expert_offset` | Offset of local experts in global expert space. Default: 0 |
-| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts | |
+| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts |
+| `--tile_tokens_dim` | Tile dimension for tokens. Default: 8 |
 | `--routing_method` | Routing method: `renormalize`, `deepseek_v3`, `llama4`, `renormalize_naive`. Default: `deepseek_v3`. |
 | `--use_shuffled_weight` | Whether to use shuffled weight layout |
 | `--weight_layout` | Weight layout: 0=MajorK, 1=MajorMn, 2=BlockMajorK. Default: 0 |
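With `--tile_tokens_dim` back in the flag table, a benchmark run can pin the tile size explicitly. A hypothetical invocation, shown only for illustration (the routine name and shape values are placeholders, not taken from this commit):

python3 flashinfer_benchmark.py --routine <trtllm MoE routine> --num_tokens 256 --hidden_size 4096 --intermediate_size 14336 --num_experts 128 --top_k 8 --routing_method deepseek_v3 --tile_tokens_dim 16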

benchmarks/bench_trtllm_gen_fused_moe_autotuner.py

Lines changed: 3 additions & 0 deletions

@@ -114,6 +114,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
         0, # local_expert_offset
         num_experts,
         2.5, # routed_scaling_factor
+        None, # tile_tokens_dim
         RoutingMethodType.DeepSeekV3.value,
         True, # use_shuffled_weight
         WeightLayout.BlockMajorK.value, # weight_layout
@@ -141,6 +142,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
         num_experts,
         1.0, # routed_scaling_factor
         False, # use_routing_scales_on_input
+        None, # tile_tokens_dim
         RoutingMethodType.TopK.value,
         enable_pdl,
         num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
@@ -285,6 +287,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4(
         0, # local_expert_offset
         num_experts,
         None, # routed_scaling_factor
+        None, # tile_tokens_dim
         RoutingMethodType.Renormalize.value,
         True,
         enable_pdl,

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 1 addition & 0 deletions

@@ -53,6 +53,7 @@
     "routed_scaling_factor",
     "local_expert_offset",
     "local_num_experts",
+    "tile_tokens_dim",
     "routing_method",
     "use_shuffled_weight",
     "weight_layout",

benchmarks/routines/moe.py

Lines changed: 37 additions & 0 deletions

@@ -116,6 +116,13 @@ def parse_moe_args(line, parser):
         default=None,
         help="Number of experts handled by this device. Defaults to num_experts.",
     )
+    parser.add_argument(
+        "--tile_tokens_dim",
+        type=int,
+        required=False,
+        default=8,
+        help="Tile dimension for tokens.",
+    )
     parser.add_argument(
         "--routing_method",
         type=str,
@@ -553,6 +560,7 @@ def testTrtllmFp4BlockScaleMoe(args):
     )
     local_expert_offset = args.local_expert_offset
     local_num_experts = args.local_num_experts or num_experts
+    tile_tokens_dim = args.tile_tokens_dim
     routing_method_type = args.routing_method_type
     use_shuffled_weight = args.use_shuffled_weight
     weight_layout = args.weight_layout
@@ -697,6 +705,7 @@ def run_fp4_moe():
             local_expert_offset=local_expert_offset,
             local_num_experts=local_num_experts,
             routed_scaling_factor=routed_scaling_factor,
+            tile_tokens_dim=tile_tokens_dim,
             routing_method_type=routing_method_type,
             gated_act_type=gated_act_type,
             do_finalize=True,
@@ -771,6 +780,7 @@ def run_fp4_moe():
         cur_res["routed_scaling_factor"] = routed_scaling_factor
         cur_res["local_expert_offset"] = local_expert_offset
         cur_res["local_num_experts"] = local_num_experts
+        cur_res["tile_tokens_dim"] = tile_tokens_dim
         cur_res["routing_method"] = args.routing_method
         cur_res["use_shuffled_weight"] = use_shuffled_weight
         cur_res["weight_layout"] = weight_layout
@@ -1175,6 +1185,7 @@ def testTrtllmFp8BlockScaleMoe(args):
     )
     local_expert_offset = args.local_expert_offset
     local_num_experts = args.local_num_experts or num_experts
+    tile_tokens_dim = args.tile_tokens_dim
     routing_method_type = args.routing_method_type
     use_shuffled_weight = args.use_shuffled_weight
     weight_layout = args.weight_layout
@@ -1266,6 +1277,27 @@ def testTrtllmFp8BlockScaleMoe(args):
         print(f"[VVERBOSE] gemm1_weights_fp8.shape = {gemm1_weights_fp8.shape}")
         print(f"[VVERBOSE] gemm2_weights_fp8.shape = {gemm2_weights_fp8.shape}")
 
+    # Match test heuristic for tile_tokens_dim when using BlockMajorK
+    if use_shuffled_weight and weight_layout == WeightLayout.BlockMajorK:
+
+        def _next_pow2(x: int) -> int:
+            x = max(1, x)
+            x -= 1
+            x |= x >> 1
+            x |= x >> 2
+            x |= x >> 4
+            x |= x >> 8
+            x |= x >> 16
+            return x + 1
+
+        tokens_per_expert = max(1, (num_tokens * top_k) // max(local_num_experts, 1))
+        suggested_tile = min(max(_next_pow2(tokens_per_expert), 8), 64)
+        if suggested_tile != tile_tokens_dim and args.verbose >= 1:
+            print(
+                f"[INFO] Overriding tile_tokens_dim {tile_tokens_dim} -> {suggested_tile} for BlockMajorK"
+            )
+        tile_tokens_dim = suggested_tile
+
     def run_fp8_block_moe():
         # Quantize hidden states to FP8 for block scale MOE
         hidden_states_fp8 = hidden_states.to(torch.float8_e4m3fn)
@@ -1288,6 +1320,7 @@ def run_fp8_block_moe():
             local_expert_offset=local_expert_offset,
             local_num_experts=local_num_experts,
             routed_scaling_factor=routed_scaling_factor,
+            tile_tokens_dim=tile_tokens_dim,
             routing_method_type=routing_method_type,
             use_shuffled_weight=use_shuffled_weight,
             weight_layout=weight_layout,
@@ -1348,6 +1381,7 @@ def run_fp8_block_moe():
         cur_res["routed_scaling_factor"] = routed_scaling_factor
         cur_res["local_expert_offset"] = local_expert_offset
         cur_res["local_num_experts"] = local_num_experts
+        cur_res["tile_tokens_dim"] = tile_tokens_dim
         cur_res["routing_method"] = args.routing_method
         cur_res["use_shuffled_weight"] = use_shuffled_weight
         cur_res["weight_layout"] = weight_layout
@@ -1414,6 +1448,7 @@ def testTrtllmFp8PerTensorScaleMoe(args):
     )
     local_expert_offset = args.local_expert_offset
     local_num_experts = args.local_num_experts or num_experts
+    tile_tokens_dim = args.tile_tokens_dim
     routing_method_type = args.routing_method_type
     use_routing_scales_on_input = args.use_routing_scales_on_input
     is_cuda_graph_compatible = not args.no_cuda_graph
@@ -1492,6 +1527,7 @@ def run_fp8_per_tensor_moe():
             local_num_experts=local_num_experts,
             routed_scaling_factor=routed_scaling_factor,
             use_routing_scales_on_input=use_routing_scales_on_input,
+            tile_tokens_dim=tile_tokens_dim,
             routing_method_type=routing_method_type,
         )
 
@@ -1549,6 +1585,7 @@ def run_fp8_per_tensor_moe():
         cur_res["routed_scaling_factor"] = routed_scaling_factor
         cur_res["local_expert_offset"] = local_expert_offset
         cur_res["local_num_experts"] = local_num_experts
+        cur_res["tile_tokens_dim"] = tile_tokens_dim
         cur_res["routing_method"] = args.routing_method
         cur_res["use_routing_bias"] = args.use_routing_bias
         cur_res["use_routing_scales_on_input"] = use_routing_scales_on_input

benchmarks/samples/sample_testlist_output.csv

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command
+routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,tile_tokens_dim,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command
 BatchPrefillWithPagedKVCacheWrapper,0.01244799979031086,0.0009464459008260536,13.963516944729905,0.3050282827732261,fa2,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
 BatchPrefillWithPagedKVCacheWrapper,0.01839040070772171,0.00021363710731210026,9.45155349045863,0.20646597430613514,cudnn,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
 BatchPrefillWithPagedKVCacheWrapper,0.008396799862384795,5.550615129103214e-05,20.70048814413847,0.45219512936224815,trtllm-gen,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
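In the sample CSV, the restored tile_tokens_dim column now sits between local_num_experts and routing_method. A minimal sketch for inspecting it, assuming pandas is installed and the file is read from the path shown in this diff:

import pandas as pd

df = pd.read_csv("benchmarks/samples/sample_testlist_output.csv")
# Attention routines leave the MoE-only columns empty, so these rows show NaN here.
print(df[["routine", "local_num_experts", "tile_tokens_dim", "routing_method"]].head())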

0 commit comments
