Commit 7816b85

Refine arguments
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
1 parent 1ccb799 commit 7816b85

5 files changed: +22 -11 lines changed

examples/layer_wise_benchmarks/README.md

Lines changed: 9 additions & 3 deletions
````diff
@@ -15,7 +15,7 @@ pip install -e ../..
 **Step 3:** In the container, run benchmarks and generate profiles:
 
 ```bash
-# Run DeepSeek-R1
+# Run DeepSeek-R1 NVFP4
 NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml
 
@@ -24,7 +24,7 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSee
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
 
 # Run DeepSeek-V3.2-Exp with 32k context length
-NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --max-num-tokens $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
+NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769
 
 # Run with attention TP
@@ -76,7 +76,7 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` &le; the number of allocated nodes:
 
 ```bash
-# Run DeepSeek-R1 with wide ep: uses MNNVL A2A if applicable
+# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
 SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
 
 # Run with attention TP and TRTLLMGen
@@ -93,3 +93,9 @@ SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config
 ## Parse profiles
 
 Coming soon.
+
+## Troubleshooting
+
+1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
+
+   The default MoE backend "CUTLASS" does not support FP8 weights. Choose the same MoE backend as your end-to-end config, typically by adding `--moe-backend DEEPGEMM`, `--moe-backend TRTLLM`, or `--moe-backend WIDEEP`.
````

examples/layer_wise_benchmarks/config_ctx.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 20480
 moe_backend: CUTLASS
 use_cuda_graph: false
 
```

examples/layer_wise_benchmarks/config_gen.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 4096 # MTP3 as max
 moe_backend: CUTLASS
 use_cuda_graph: true
 
```

Both example configs drop their hard-coded `max_num_tokens`; when the flag is not given either, `run_single.py` now derives the value from the batch size and query length (see the sketch after that file's diff below).

examples/layer_wise_benchmarks/run_single.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -27,6 +27,7 @@ def comma_separated_ints(s):
 parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"])
 parser.add_argument("--scaled-from", type=int)
 # KV cache related args
+parser.add_argument("--max-batch-size", type=int)
 parser.add_argument("--tokens-per-block", type=int)
 parser.add_argument("--max-seq-len", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
@@ -40,6 +41,7 @@ def comma_separated_ints(s):
 # Model init args
 parser.add_argument("--max-num-tokens", type=int)
 parser.add_argument("--moe-backend", type=str)
+parser.add_argument("--moe-max-num-tokens", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
 group.add_argument("--use-cuda-graph",
                    action="store_true",
@@ -59,8 +61,12 @@ def comma_separated_ints(s):
     config = yaml.safe_load(f)
 del args.config_path
 for k, v in vars(args).items():
-    if v is None:
+    if v is None and k in config:
         setattr(args, k, config[k])
+if args.max_batch_size is None:
+    args.max_batch_size = args.batch_size
+if args.max_num_tokens is None:
+    args.max_num_tokens = args.max_batch_size * args.seq_len_q
 print(args)
 
 # MPI args
@@ -72,12 +78,11 @@ def comma_separated_ints(s):
 # Create KV cache manager
 mapping = DeepSeekV3Runner.create_mapping(
     enable_attention_dp=args.enable_attention_dp)
-max_batch_size = 2048
 kv_cache_manager = DeepSeekV3Runner.create_kv_cache_manager(
     args.model,
     mapping,
     tokens_per_block=args.tokens_per_block,
-    max_batch_size=max_batch_size,
+    max_batch_size=args.max_batch_size,
     max_seq_len=args.max_seq_len,
     layer_indices=args.layer_indices)
 attn_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
@@ -94,10 +99,11 @@ def comma_separated_ints(s):
     scaled_from=args.scaled_from,
     max_seq_len=args.max_seq_len,
     max_num_tokens=args.max_num_tokens,
+    moe_max_num_tokens=args.moe_max_num_tokens,
     use_cuda_graph=args.use_cuda_graph)
 
 # Warm up
-assert args.batch_size <= max_batch_size
+assert args.batch_size <= args.max_batch_size
 assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len
 run_pack = runner.create_run_pack(args.run_type,
                                   batch_size=args.batch_size,
```
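
Taken together, the `run_single.py` changes give each setting a clear precedence: an explicit CLI flag wins, a key present in the YAML config fills in flags left unset, and the two capacity values fall back to derived defaults instead of the old hard-coded `max_batch_size = 2048`. Below is a minimal self-contained sketch of that resolution order; the argument names come from this diff, while the standalone scaffolding around them is assumed for illustration (in the real script, `batch_size` and `seq_len_q` are supplied by the shipped configs):

```python
# Minimal sketch of the argument resolution introduced by this commit.
import argparse

import yaml

parser = argparse.ArgumentParser()
parser.add_argument("config_path")
parser.add_argument("--batch-size", type=int)   # expected from the YAML config
parser.add_argument("--seq-len-q", type=int)    # expected from the YAML config
parser.add_argument("--max-batch-size", type=int)      # new in this commit
parser.add_argument("--max-num-tokens", type=int)
parser.add_argument("--moe-max-num-tokens", type=int)  # new in this commit
args = parser.parse_args()

with open(args.config_path) as f:
    config = yaml.safe_load(f)
del args.config_path

# The YAML only fills in flags the user left unset. The new `k in config`
# guard skips args that have no YAML key (e.g. max_batch_size) instead of
# failing on the old unconditional `config[k]` lookup.
for k, v in vars(args).items():
    if v is None and k in config:
        setattr(args, k, config[k])

# Derived defaults, replacing the old hard-coded `max_batch_size = 2048`:
if args.max_batch_size is None:
    args.max_batch_size = args.batch_size
if args.max_num_tokens is None:
    args.max_num_tokens = args.max_batch_size * args.seq_len_q
print(args)
```

This is also why `max_num_tokens` could be dropped from both YAML configs: a run that does not override it now gets a value sized to exactly one batch of queries rather than a fixed 20480 or 4096.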

tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -142,7 +142,8 @@ class DeepSeekV3Runner:
     def __init__(self, pretrained_model_name_or_path: str, mapping: Mapping, *,
                  moe_backend: str, layer_indices: List[int],
                  scaled_from: Optional[int], max_seq_len: int,
-                 max_num_tokens: int, use_cuda_graph: bool):
+                 max_num_tokens: int, moe_max_num_tokens: int,
+                 use_cuda_graph: bool):
 
         # Temporally replace the gate class
         gate_cls_orig = tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate
@@ -158,7 +159,7 @@ def __init__(self, pretrained_model_name_or_path: str, mapping: Mapping, *,
             sparse_attention_config=None,  # To be loaded from config
             max_num_tokens=max_num_tokens,
             max_seq_len=max_seq_len,
-            moe_max_num_tokens=None,
+            moe_max_num_tokens=moe_max_num_tokens,
             moe_load_balancer=None,
             lora_config=None,
             allreduce_strategy=AllReduceStrategy.AUTO,
```
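
With the extra parameter, `moe_max_num_tokens` now flows from the CLI through `run_single.py` into the model config rather than being pinned to `None`. A hypothetical call site, for illustration only: the keyword list mirrors the `__init__` signature in this diff, but the model id, `layer_indices=[5]`, and the numeric values are placeholders drawn from the README and example configs, not a prescribed setup:

```python
# Hypothetical instantiation; values are illustrative placeholders.
runner = DeepSeekV3Runner(
    "deepseek-ai/DeepSeek-R1",   # pretrained_model_name_or_path
    mapping,                     # from DeepSeekV3Runner.create_mapping(...)
    moe_backend="DEEPGEMM",
    layer_indices=[5],           # benchmark a single decoder layer, say
    scaled_from=None,
    max_seq_len=9220,            # 8192 + 1024 + 4, as in the example configs
    max_num_tokens=4096,
    moe_max_num_tokens=4096,     # new: forwarded instead of hard-coded None
    use_cuda_graph=True)
```

If `--moe-max-num-tokens` is omitted and the key is absent from the YAML, the forwarded value stays `None`, preserving the previous behavior.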
