Commit 7816b85

Refine arguments
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
1 parent 1ccb799 commit 7816b85

5 files changed: +22 -11 lines changed

examples/layer_wise_benchmarks/README.md

Lines changed: 9 additions & 3 deletions
````diff
@@ -15,7 +15,7 @@ pip install -e ../..
 **Step 3:** In the container, run benchmarks and generate profiles:
 
 ```bash
-# Run DeepSeek-R1
+# Run DeepSeek-R1 NVFP4
 NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml
 
@@ -24,7 +24,7 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSee
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
 
 # Run DeepSeek-V3.2-Exp with 32k context length
-NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --max-num-tokens $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
+NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769
 
 # Run with attention TP
@@ -76,7 +76,7 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` &le; the number of allocated nodes:
 
 ```bash
-# Run DeepSeek-R1 with wide ep: uses MNNVL A2A if applicable
+# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
 SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
 
 # Run with attention TP and TRTLLMGen
@@ -93,3 +93,9 @@ SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config
 ## Parse profiles
 
 Coming soon.
+
+## Troubleshooting
+
+1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
+
+   The default MoE backend "CUTLASS" does not support FP8 weights. Choose the same MoE backend as your end-to-end config, typically by adding `--moe-backend DEEPGEMM`, `--moe-backend TRTLLM`, or `--moe-backend WIDEEP`.
````

examples/layer_wise_benchmarks/config_ctx.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 20480
 moe_backend: CUTLASS
 use_cuda_graph: false
 
```

examples/layer_wise_benchmarks/config_gen.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 4096 # MTP3 as max
 moe_backend: CUTLASS
 use_cuda_graph: true
 
```

Both example configs drop their hard-coded `max_num_tokens`; when the flag is not given either, `run_single.py` now derives the value from the batch size and query length (see the sketch after that file's diff below).

examples/layer_wise_benchmarks/run_single.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -27,6 +27,7 @@ def comma_separated_ints(s):
 parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"])
 parser.add_argument("--scaled-from", type=int)
 # KV cache related args
+parser.add_argument("--max-batch-size", type=int)
 parser.add_argument("--tokens-per-block", type=int)
 parser.add_argument("--max-seq-len", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
@@ -40,6 +41,7 @@ def comma_separated_ints(s):
 # Model init args
 parser.add_argument("--max-num-tokens", type=int)
 parser.add_argument("--moe-backend", type=str)
+parser.add_argument("--moe-max-num-tokens", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
 group.add_argument("--use-cuda-graph",
                    action="store_true",
@@ -59,8 +61,12 @@ def comma_separated_ints(s):
     config = yaml.safe_load(f)
 del args.config_path
 for k, v in vars(args).items():
-    if v is None:
+    if v is None and k in config:
         setattr(args, k, config[k])
+if args.max_batch_size is None:
+    args.max_batch_size = args.batch_size
+if args.max_num_tokens is None:
+    args.max_num_tokens = args.max_batch_size * args.seq_len_q
 print(args)
 
 # MPI args
@@ -72,12 +78,11 @@ def comma_separated_ints(s):
 # Create KV cache manager
 mapping = DeepSeekV3Runner.create_mapping(
     enable_attention_dp=args.enable_attention_dp)
-max_batch_size = 2048
 kv_cache_manager = DeepSeekV3Runner.create_kv_cache_manager(
     args.model,
     mapping,
     tokens_per_block=args.tokens_per_block,
-    max_batch_size=max_batch_size,
+    max_batch_size=args.max_batch_size,
     max_seq_len=args.max_seq_len,
     layer_indices=args.layer_indices)
 attn_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
@@ -94,10 +99,11 @@ def comma_separated_ints(s):
     scaled_from=args.scaled_from,
     max_seq_len=args.max_seq_len,
     max_num_tokens=args.max_num_tokens,
+    moe_max_num_tokens=args.moe_max_num_tokens,
     use_cuda_graph=args.use_cuda_graph)
 
 # Warm up
-assert args.batch_size <= max_batch_size
+assert args.batch_size <= args.max_batch_size
 assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len
 run_pack = runner.create_run_pack(args.run_type,
                                   batch_size=args.batch_size,
```
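
Taken together, the `run_single.py` changes give each setting a clear precedence: an explicit CLI flag wins, a key present in the YAML config fills in flags left unset, and the two capacity values fall back to derived defaults instead of the old hard-coded `max_batch_size = 2048`. Below is a minimal self-contained sketch of that resolution order; the argument names come from this diff, while the standalone scaffolding around them is assumed for illustration (in the real script, `batch_size` and `seq_len_q` are supplied by the shipped configs):

```python
# Minimal sketch of the argument resolution introduced by this commit.
import argparse

import yaml

parser = argparse.ArgumentParser()
parser.add_argument("config_path")
parser.add_argument("--batch-size", type=int)   # expected from the YAML config
parser.add_argument("--seq-len-q", type=int)    # expected from the YAML config
parser.add_argument("--max-batch-size", type=int)      # new in this commit
parser.add_argument("--max-num-tokens", type=int)
parser.add_argument("--moe-max-num-tokens", type=int)  # new in this commit
args = parser.parse_args()

with open(args.config_path) as f:
    config = yaml.safe_load(f)
del args.config_path

# The YAML only fills in flags the user left unset. The new `k in config`
# guard skips args that have no YAML key (e.g. max_batch_size) instead of
# failing on the old unconditional `config[k]` lookup.
for k, v in vars(args).items():
    if v is None and k in config:
        setattr(args, k, config[k])

# Derived defaults, replacing the old hard-coded `max_batch_size = 2048`:
if args.max_batch_size is None:
    args.max_batch_size = args.batch_size
if args.max_num_tokens is None:
    args.max_num_tokens = args.max_batch_size * args.seq_len_q
print(args)
```

This is also why `max_num_tokens` could be dropped from both YAML configs: a run that does not override it now gets a value sized to exactly one batch of queries rather than a fixed 20480 or 4096.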

tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -142,7 +142,8 @@ class DeepSeekV3Runner:
     def __init__(self, pretrained_model_name_or_path: str, mapping: Mapping, *,
                  moe_backend: str, layer_indices: List[int],
                  scaled_from: Optional[int], max_seq_len: int,
-                 max_num_tokens: int, use_cuda_graph: bool):
+                 max_num_tokens: int, moe_max_num_tokens: int,
+                 use_cuda_graph: bool):
 
         # Temporally replace the gate class
         gate_cls_orig = tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate
@@ -158,7 +159,7 @@ def __init__(self, pretrained_model_name_or_path: str, mapping: Mapping, *,
             sparse_attention_config=None,  # To be loaded from config
             max_num_tokens=max_num_tokens,
             max_seq_len=max_seq_len,
-            moe_max_num_tokens=None,
+            moe_max_num_tokens=moe_max_num_tokens,
             moe_load_balancer=None,
             lora_config=None,
             allreduce_strategy=AllReduceStrategy.AUTO,
```
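
With the extra parameter, `moe_max_num_tokens` now flows from the CLI through `run_single.py` into the model config rather than being pinned to `None`. A hypothetical call site, for illustration only: the keyword list mirrors the `__init__` signature in this diff, but the model id, `layer_indices=[5]`, and the numeric values are placeholders drawn from the README and example configs, not a prescribed setup:

```python
# Hypothetical instantiation; values are illustrative placeholders.
runner = DeepSeekV3Runner(
    "deepseek-ai/DeepSeek-R1",   # pretrained_model_name_or_path
    mapping,                     # from DeepSeekV3Runner.create_mapping(...)
    moe_backend="DEEPGEMM",
    layer_indices=[5],           # benchmark a single decoder layer, say
    scaled_from=None,
    max_seq_len=9220,            # 8192 + 1024 + 4, as in the example configs
    max_num_tokens=4096,
    moe_max_num_tokens=4096,     # new: forwarded instead of hard-coded None
    use_cuda_graph=True)
```

If `--moe-max-num-tokens` is omitted and the key is absent from the YAML, the forwarded value stays `None`, preserving the previous behavior.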
