diff --git a/benchmarks/README.md b/benchmarks/README.md index e7e17156a4..d81e9c3642 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -166,8 -166,7 @@ The output CSV will contain detailed metrics including: | `--topk_group` | Number of groups to consider for top-k routing. Default: 1 | | `--routed_scaling_factor`| Scaling factor for routing. Default: 2.5 | | `--local_expert_offset` | Offset of local experts in global expert space. Default: 0 | -| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts | -| `--tile_tokens_dim` | Tile dimension for tokens. Default: 8 | +| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts | | `--routing_method` | Routing method: `renormalize`, `deepseek_v3`, `llama4`, `renormalize_naive`. Default: `deepseek_v3`. | | `--use_shuffled_weight` | Whether to use shuffled weight layout | | `--weight_layout` | Weight layout: 0=MajorK, 1=MajorMn, 2=BlockMajorK. Default: 0 | diff --git a/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py b/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py index 0aff25860e..203faaff82 100644 --- a/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py +++ b/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py @@ -114,7 +114,6 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8( 0, # local_expert_offset num_experts, 2.5, # routed_scaling_factor - None, # tile_tokens_dim RoutingMethodType.DeepSeekV3.value, True, # use_shuffled_weight WeightLayout.BlockMajorK.value, # weight_layout @@ -142,7 +141,6 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8( num_experts, 1.0, # routed_scaling_factor False, # use_routing_scales_on_input - None, # tile_tokens_dim RoutingMethodType.TopK.value, enable_pdl, num_tokens if tune_max_num_tokens is None else tune_max_num_tokens, @@ -287,7 +285,6 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4( 0, # local_expert_offset num_experts, None, # routed_scaling_factor - None, # tile_tokens_dim RoutingMethodType.Renormalize.value, True, enable_pdl, diff --git a/benchmarks/routines/flashinfer_benchmark_utils.py b/benchmarks/routines/flashinfer_benchmark_utils.py index 8798f8340f..520029f0ec 100644 --- a/benchmarks/routines/flashinfer_benchmark_utils.py +++ b/benchmarks/routines/flashinfer_benchmark_utils.py @@ -53,7 +53,6 @@ "routed_scaling_factor", "local_expert_offset", "local_num_experts", - "tile_tokens_dim", "routing_method", "use_shuffled_weight", "weight_layout", diff --git a/benchmarks/routines/moe.py b/benchmarks/routines/moe.py index 6af3425c73..8f26bdb8f7 100644 --- a/benchmarks/routines/moe.py +++ b/benchmarks/routines/moe.py @@ -116,13 +116,6 @@ def parse_moe_args(line, parser): default=None, help="Number of experts handled by this device. 
Defaults to num_experts.", ) - parser.add_argument( - "--tile_tokens_dim", - type=int, - required=False, - default=8, - help="Tile dimension for tokens.", - ) parser.add_argument( "--routing_method", type=str, @@ -560,7 +553,6 @@ def testTrtllmFp4BlockScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_shuffled_weight = args.use_shuffled_weight weight_layout = args.weight_layout @@ -705,7 +697,6 @@ def run_fp4_moe(): local_expert_offset=local_expert_offset, local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, gated_act_type=gated_act_type, do_finalize=True, @@ -780,7 +771,6 @@ def run_fp4_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_shuffled_weight"] = use_shuffled_weight cur_res["weight_layout"] = weight_layout @@ -1185,7 +1175,6 @@ def testTrtllmFp8BlockScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_shuffled_weight = args.use_shuffled_weight weight_layout = args.weight_layout @@ -1277,27 +1266,6 @@ def testTrtllmFp8BlockScaleMoe(args): print(f"[VVERBOSE] gemm1_weights_fp8.shape = {gemm1_weights_fp8.shape}") print(f"[VVERBOSE] gemm2_weights_fp8.shape = {gemm2_weights_fp8.shape}") - # Match test heuristic for tile_tokens_dim when using BlockMajorK - if use_shuffled_weight and weight_layout == WeightLayout.BlockMajorK: - - def _next_pow2(x: int) -> int: - x = max(1, x) - x -= 1 - x |= x >> 1 - x |= x >> 2 - x |= x >> 4 - x |= x >> 8 - x |= x >> 16 - return x + 1 - - tokens_per_expert = max(1, (num_tokens * top_k) // max(local_num_experts, 1)) - suggested_tile = min(max(_next_pow2(tokens_per_expert), 8), 64) - if suggested_tile != tile_tokens_dim and args.verbose >= 1: - print( - f"[INFO] Overriding tile_tokens_dim {tile_tokens_dim} -> {suggested_tile} for BlockMajorK" - ) - tile_tokens_dim = suggested_tile - def run_fp8_block_moe(): # Quantize hidden states to FP8 for block scale MOE hidden_states_fp8 = hidden_states.to(torch.float8_e4m3fn) @@ -1320,7 +1288,6 @@ def run_fp8_block_moe(): local_expert_offset=local_expert_offset, local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, use_shuffled_weight=use_shuffled_weight, weight_layout=weight_layout, @@ -1381,7 +1348,6 @@ def run_fp8_block_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_shuffled_weight"] = use_shuffled_weight cur_res["weight_layout"] = weight_layout @@ -1448,7 +1414,6 @@ def testTrtllmFp8PerTensorScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_routing_scales_on_input = 
args.use_routing_scales_on_input is_cuda_graph_compatible = not args.no_cuda_graph @@ -1527,7 +1492,6 @@ def run_fp8_per_tensor_moe(): local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, use_routing_scales_on_input=use_routing_scales_on_input, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, ) @@ -1585,7 +1549,6 @@ def run_fp8_per_tensor_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_routing_bias"] = args.use_routing_bias cur_res["use_routing_scales_on_input"] = use_routing_scales_on_input diff --git a/benchmarks/samples/sample_testlist_output.csv b/benchmarks/samples/sample_testlist_output.csv index d856d37ab0..b07c523ecb 100644 --- a/benchmarks/samples/sample_testlist_output.csv +++ b/benchmarks/samples/sample_testlist_output.csv @@ -1,4 +1,4 @@ -routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,tile_tokens_dim,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command +routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command BatchPrefillWithPagedKVCacheWrapper,0.01244799979031086,0.0009464459008260536,13.963516944729905,0.3050282827732261,fa2,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B BatchPrefillWithPagedKVCacheWrapper,0.01839040070772171,0.00021363710731210026,9.45155349045863,0.20646597430613514,cudnn,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 
flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B BatchPrefillWithPagedKVCacheWrapper,0.008396799862384795,5.550615129103214e-05,20.70048814413847,0.45219512936224815,trtllm-gen,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B diff --git a/benchmarks/samples/sample_testlist_output.txt b/benchmarks/samples/sample_testlist_output.txt index 69a3961f87..d2c5cc4fa1 100644 --- a/benchmarks/samples/sample_testlist_output.txt +++ b/benchmarks/samples/sample_testlist_output.txt @@ -292,7 +292,7 @@ 2025-09-23 00:32:18,247 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends [PERF] cutlass_autotun:: median time 0.009 ms; std 0.000 ms; achieved tflops 6.372 TFLOPs/sec; achieved tb_per_sec 0.401 TB/sec [PERF] trtllm_autotune:: median time 0.011 ms; std 0.000 ms; achieved tflops 5.410 TFLOPs/sec; achieved tb_per_sec 0.340 TB/sec -[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testTrtllmFp4BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -303,7 +303,7 @@ [VVERBOSE] gemm1_weights_fp4.shape = 
torch.Size([256, 2048, 512]) [VVERBOSE] gemm2_weights_fp4.shape = torch.Size([256, 1024, 512]) [PERF] trtllm :: median time 0.224 ms; std 0.000 ms; achieved tflops 230.555 TFLOPs/sec; achieved tb_per_sec 1.818 TB/sec -[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=8, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='renormalize_naive', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=4, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=8, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize_naive', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=4, gated_act_type=0) [INFO] Running testTrtllmFp4BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -314,7 +314,7 @@ [VVERBOSE] gemm1_weights_fp4.shape = torch.Size([128, 2048, 512]) [VVERBOSE] gemm2_weights_fp4.shape = torch.Size([128, 1024, 512]) [PERF] trtllm :: median time 0.226 ms; std 0.000 ms; achieved tflops 227.846 TFLOPs/sec; achieved tb_per_sec 0.903 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, 
topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testTrtllmFp8BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -325,7 +325,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([256, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([256, 1024, 1024]) [PERF] trtllm :: median time 0.557 ms; std 0.000 ms; achieved tflops 92.607 TFLOPs/sec; achieved tb_per_sec 1.455 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_per_tensor_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='llama4', use_shuffled_weight=False, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=True, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=3, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_per_tensor_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='llama4', use_shuffled_weight=False, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=True, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=3, gated_act_type=0) [INFO] Running testTrtllmFp8PerTensorScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -336,7 +336,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024]) [PERF] trtllm :: median time 0.123 ms; std 0.000 ms; achieved tflops 52.340 TFLOPs/sec; achieved tb_per_sec 3.299 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='renormalize', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, 
input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=1, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=1, gated_act_type=0) [INFO] Running testTrtllmFp8BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -347,7 +347,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024]) [PERF] trtllm :: median time 0.109 ms; std 0.000 ms; achieved tflops 59.297 TFLOPs/sec; achieved tb_per_sec 3.740 TB/sec -[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_base', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_base', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testCutlassFusedMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' diff --git a/csrc/trtllm_fused_moe_kernel_launcher.cu b/csrc/trtllm_fused_moe_kernel_launcher.cu index f3c45e2ec0..fc6393237f 100644 --- a/csrc/trtllm_fused_moe_kernel_launcher.cu +++ b/csrc/trtllm_fused_moe_kernel_launcher.cu @@ -1386,8 +1386,6 @@ Tensor trtllm_fp8_per_tensor_scale_moe( auto launcher = std::make_unique( routing_logits, 
routing_bias, hidden_states, gemm1_weights, output1_scales_scalar, output1_scales_gate_scalar, gemm2_weights, output2_scales_scalar); - // Note: Original code passes tile_N where tile_tokens_dim is expected - // This seems incorrect but we match the original behavior launcher->init(std::move(args), curr_tile_N, routing_method_type, use_shuffled_weight, weight_layout, use_routing_scales_on_input); @@ -1470,8 +1468,6 @@ Tensor trtllm_fp8_block_scale_moe( auto launcher = std::make_unique( routing_logits, routing_bias, hidden_states, hidden_states_scale, gemm1_weights, gemm1_weights_scale, gemm2_weights, gemm2_weights_scale); - // Note: Original code passes tile_N where tile_tokens_dim is expected - // This seems incorrect but we match the original behavior launcher->init(std::move(args), curr_tile_N, routing_method_type, use_shuffled_weight, weight_layout); diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py index 83f186673b..b4444aa431 100644 --- a/flashinfer/fused_moe/core.py +++ b/flashinfer/fused_moe/core.py @@ -1952,7 +1952,6 @@ def trtllm_fp8_per_tensor_scale_moe( local_num_experts: int, routed_scaling_factor: Optional[float], use_routing_scales_on_input: bool, - tile_tokens_dim: Optional[int] = None, routing_method_type: int = 0, enable_pdl: Optional[bool] = None, tune_max_num_tokens: int = 8192, @@ -1977,7 +1976,6 @@ def trtllm_fp8_per_tensor_scale_moe( local_num_experts: Number of experts handled by this device routed_scaling_factor: Scaling factor for routing use_routing_scales_on_input: Whether to use routing scales on input - tile_tokens_dim: Tile dimension for tokens (default: None, will be deprecated in the future) routing_method_type: Type of routing method to use (default: 0) enable_pdl: Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90. tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192) @@ -1985,12 +1983,6 @@ def trtllm_fp8_per_tensor_scale_moe( Returns: torch.Tensor: Output tensor of shape [seq_len, hidden_size] """ - if tile_tokens_dim is not None: - logger.warning_once( - "tile_tokens_dim in trtllm_fp8_per_tensor_scale_moe is planned for deprecation " - "in a future release. Please remove it from your code as tile_tokens_dim will no " - "longer be supported after v0.5.0." - ) return get_trtllm_moe_sm100_module().trtllm_fp8_per_tensor_scale_moe( routing_logits, routing_bias, @@ -2032,7 +2024,6 @@ def trtllm_fp8_block_scale_moe( local_expert_offset: int, local_num_experts: int, routed_scaling_factor: Optional[float], - tile_tokens_dim: Optional[int] = None, routing_method_type: int = 0, use_shuffled_weight: bool = False, weight_layout: int = 0, @@ -2058,19 +2049,12 @@ def trtllm_fp8_block_scale_moe( local_expert_offset: Offset of local experts in global expert space local_num_experts: Number of experts handled by this device routed_scaling_factor: Scaling factor for routing - tile_tokens_dim: Tile dimension for tokens (default: None, will be deprecated in the future) routing_method_type: Type of routing method to use (default: 0) enable_pdl: Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90. tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192) Returns: torch.Tensor: Output tensor of shape [seq_len, hidden_size] """ - if tile_tokens_dim is not None: - logger.warning_once( - "tile_tokens_dim in trtllm_fp8_block_scale_moe is planned for deprecation " - "in a future release. 
Please remove it from your code as tile_tokens_dim will no " - "longer be supported after v0.5.0." - ) output = torch.empty( hidden_states.shape, dtype=torch.bfloat16, device=hidden_states.device ) @@ -2125,7 +2109,6 @@ def trtllm_fp4_block_scale_moe( local_expert_offset: int, local_num_experts: int, routed_scaling_factor: Optional[float], - tile_tokens_dim: Optional[int], routing_method_type: int = 0, do_finalize: bool = True, enable_pdl: Optional[bool] = None, @@ -2176,7 +2159,6 @@ def trtllm_fp4_block_scale_moe( local_expert_offset (int): Offset of local experts in global expert space local_num_experts (int): Number of experts handled by this device routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods) - tile_tokens_dim (Optional[int]): Tile dimension for tokens (default: None, will be deprecated in the future) routing_method_type (int): Type of routing method to use (default: 0) - 0: Default (Softmax -> TopK) - 1: Renormalize (TopK -> Softmax) @@ -2195,12 +2177,6 @@ def trtllm_fp4_block_scale_moe( List[torch.Tensor]: List of output tensors. If do_finalize=True, returns the final MoE output. Otherwise, returns intermediate results (gemm2_output, expert_weights, expanded_idx_to_permuted_idx) that need further processing. """ - if tile_tokens_dim is not None: - logger.warning_once( - "tile_tokens_dim in trtllm_fp4_block_scale_moe is planned for deprecation " - "in a future release. Please remove it from your code as tile_tokens_dim will no " - "longer be supported after v0.5.0." - ) return get_trtllm_moe_sm100_module().trtllm_fp4_block_scale_moe( routing_logits, None, @@ -2262,7 +2238,6 @@ def trtllm_fp4_block_scale_routed_moe( local_expert_offset: int, local_num_experts: int, routed_scaling_factor: Optional[float], - tile_tokens_dim: Optional[int], routing_method_type: int = 0, do_finalize: bool = True, enable_pdl: Optional[bool] = None, @@ -2315,7 +2290,6 @@ def trtllm_fp4_block_scale_routed_moe( local_expert_offset (int): Offset of local experts in global expert space local_num_experts (int): Number of experts handled by this device routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods) - tile_tokens_dim (Optional[int]): Tile dimension for tokens (default: None, will be deprecated in the future) routing_method_type (int): Type of routing method to use (default: 0) - 0: Default (Softmax -> TopK) - 1: Renormalize (TopK -> Softmax) @@ -2334,12 +2308,6 @@ def trtllm_fp4_block_scale_routed_moe( List[torch.Tensor]: List of output tensors. If do_finalize=True, returns the final MoE output. Otherwise, returns intermediate results (gemm2_output, expert_weights, expanded_idx_to_permuted_idx) that need further processing. """ - if tile_tokens_dim is not None: - logger.warning_once( - "tile_tokens_dim in trtllm_fp4_block_scale_routed_moe is planned for deprecation " - "in a future release. Please remove it from your code as tile_tokens_dim will no " - "longer be supported after v0.5.0." 
- ) return get_trtllm_moe_sm100_module().trtllm_fp4_block_scale_moe( None, topk_ids, diff --git a/tests/moe/test_trtllm_gen_fused_moe.py b/tests/moe/test_trtllm_gen_fused_moe.py index 747946fc09..35f4ad61e7 100644 --- a/tests/moe/test_trtllm_gen_fused_moe.py +++ b/tests/moe/test_trtllm_gen_fused_moe.py @@ -208,7 +208,6 @@ def _run_moe_computation(self, runtime_args): local_expert_offset=0, local_num_experts=self.config["num_experts"], routed_scaling_factor=self.config["routed_scaling"], - tile_tokens_dim=None, routing_method_type=self.config["routing_method_type"], gated_act_type=self.config["gated_act_type"], do_finalize=True, @@ -799,7 +798,6 @@ def call_moe( 0, num_experts, routed_scaling, - None, routing_method_type, use_shuffled_weight=static_data["use_shuffled_weight"], weight_layout=static_data["weight_layout"], @@ -979,7 +977,6 @@ def call_moe( routed_scaling, routing_method_type == RoutingMethodType.Llama4, # Use_routing_scales_on_input - None, routing_method_type, tune_max_num_tokens=TUNE_MAX_NUM_TOKENS, ) diff --git a/tests/moe/test_trtllm_gen_routed_fused_moe.py b/tests/moe/test_trtllm_gen_routed_fused_moe.py index be39bda225..fb3feba4b7 100644 --- a/tests/moe/test_trtllm_gen_routed_fused_moe.py +++ b/tests/moe/test_trtllm_gen_routed_fused_moe.py @@ -180,7 +180,6 @@ def test_trtllm_gen_routed_fused_moe( 0, # local_expert_offset num_experts, None, # routed_scaling_factor - None, # tile_tokens_dim routing_method_type.value, True, # do_finalize enable_pdl, @@ -234,7 +233,6 @@ def test_trtllm_gen_routed_fused_moe( 0, # local_expert_offset num_experts, None, # routed_scaling_factor - None, # tile_tokens_dim routing_method_type.value, True, # do_finalize enable_pdl,
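For reference, the BlockMajorK override block deleted from `benchmarks/routines/moe.py` above reimplemented the tile-size heuristic in Python before passing it through `tile_tokens_dim`. Below is a minimal, self-contained sketch of that removed heuristic; `_next_pow2` and the constants are taken directly from the deleted code, while the `suggested_tile_tokens_dim` wrapper name is a hypothetical addition for illustration:

```python
def _next_pow2(x: int) -> int:
    """Round x up to the next power of two (minimum 1)."""
    x = max(1, x)
    x -= 1
    x |= x >> 1
    x |= x >> 2
    x |= x >> 4
    x |= x >> 8
    x |= x >> 16
    return x + 1


def suggested_tile_tokens_dim(num_tokens: int, top_k: int, local_num_experts: int) -> int:
    """Heuristic removed from the benchmark: average tokens routed per local
    expert, rounded up to a power of two and clamped to [8, 64]."""
    tokens_per_expert = max(1, (num_tokens * top_k) // max(local_num_experts, 1))
    return min(max(_next_pow2(tokens_per_expert), 8), 64)


# E.g. the deepseek_v3 benchmark case above: 1024 tokens, top_k=8, 256 experts
# -> 32 tokens per expert -> tile of 32.
assert suggested_tile_tokens_dim(1024, 8, 256) == 32
```

With `tile_tokens_dim` dropped from the public signatures, this selection is presumably handled internally by the kernel launcher and autotuner, which is why every call site in this diff simply removes the argument rather than replacing it.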