From 6bd3d7e6b91b3edf4b1cdab1fbca040c548a0720 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 05:06:26 +0000 Subject: [PATCH 01/16] Update readme Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 20 +++++++++------- .../slurm_init_containers.sh | 3 ++- .../slurm_query_container_name.sh | 24 +++++++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) create mode 100755 examples/layer_wise_benchmarks/slurm_query_container_name.sh diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 6cb324cd126..da4f2f2d65f 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -28,12 +28,12 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSee NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 # Run with attention TP -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM # Run with MTP3 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4 @@ -49,17 +49,19 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-back NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp # Run Qwen3-Next (balanced routing is not implemented) -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified +NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified # Run with DeepEP A2A -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_ctx.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_gen.yaml --moe-backend WIDEEP ``` ### Run with Slurm -> Tips: If you have a running job 
with environment installed, please skip step 1 and 2 and go straight to step 3. In this case, your job must be run with `--container-name aaa`, and if the container name is not "layer_wise_benchmarks" please `export CONTAINER_NAME=aaa`.
+> Tips:
+> 1. If you already have a running Slurm job, skip step 1 and go straight to steps 2 and 3.
+> 2. Furthermore, if `tensorrt_llm` is already installed in the Slurm job, you can also skip step 2 and run step 3 with `export CONTAINER_NAME=aaa` set to your container's name. If you don't know the container name, run `export CONTAINER_NAME=$(SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_query_container_name.sh)` to get it.
 
 **Step 1:** On the controller node, allocate one or multiple nodes, and record the `SLURM_JOB_ID`:
 
@@ -77,6 +79,8 @@ SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh
 
 It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once.
 
+> Tips: If you want to change the image, there is no need to reallocate the Slurm job. Just start another container by running step 2 with `export CONTAINER_NAME=aaa` set; step 3 will then run in the container specified by the `CONTAINER_NAME` environment variable.
+
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:
 
 ```bash
diff --git a/examples/layer_wise_benchmarks/slurm_init_containers.sh b/examples/layer_wise_benchmarks/slurm_init_containers.sh
index 59d783b183a..08e77c5623a 100755
--- a/examples/layer_wise_benchmarks/slurm_init_containers.sh
+++ b/examples/layer_wise_benchmarks/slurm_init_containers.sh
@@ -3,6 +3,7 @@
 set -euo pipefail
 
 # CONTAINER_IMAGE=
+CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
 CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..")
 
 if [ "${SLURM_JOB_ID:-}" == "" ]; then
@@ -46,7 +47,7 @@ set -x
 srun -N "$NODES" \
     --ntasks-per-node 1 \
     --container-image "$CONTAINER_IMAGE" \
-    --container-name "layer_wise_benchmarks" \
+    --container-name "$CONTAINER_NAME" \
     --container-mounts "$CONTAINER_MOUNTS" \
     --container-workdir "$WORKDIR" \
     bash -c "pip install -U packaging &&
diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh
new file mode 100755
index 00000000000..2445ceaa37c
--- /dev/null
+++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+prefix="pyxis_${SLURM_JOB_ID}_"
+matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
+count=$(printf "%s" "$matches" | grep -c "^" || true)
+
+if [ "$count" -eq 0 ]; then
+    echo "Error: No container found" >&2
+    exit 1
+fi
+
+if [ "$count" -gt 1 ]; then
+    echo "Error: Multiple containers found" >&2
+    while IFS= read -r match; do
+        echo "- ${match#$prefix}" >&2
+    done <<< "$matches"
+    exit 1
+fi
+
+suffix=${matches#$prefix}
+echo "Container name: $suffix" >&2
+echo "$suffix"

From a2a840dd0683de4975589f472b25c5947efdbb1a Mon Sep 17 00:00:00 2001
From: Tailing Yuan
Date: Wed, 19 Nov 2025 07:01:20 +0000
Subject: [PATCH 02/16] Support list arguments

Signed-off-by: Tailing Yuan
---
 examples/layer_wise_benchmarks/run_single.py | 152 ++++++++++++-------
 1 file changed, 100 insertions(+), 52 deletions(-)

diff --git a/examples/layer_wise_benchmarks/run_single.py b/examples/layer_wise_benchmarks/run_single.py
index 77b95c35a6c..d48e9688091 100644
--- a/examples/layer_wise_benchmarks/run_single.py
+++ b/examples/layer_wise_benchmarks/run_single.py
@@ -1,4 +1,5 @@
 import argparse
+import 
itertools import numpy as np import nvtx @@ -15,6 +16,10 @@ def comma_separated_ints(s): return [int(x) for x in s.split(",")] +def comma_separated_floats(s): + return [float(x) for x in s.split(",")] + + # Parse cmdline parser = argparse.ArgumentParser() parser.add_argument("config_path", type=str) @@ -48,17 +53,37 @@ def comma_separated_ints(s): parser.add_argument("--seq-len-kv-cache", type=int) parser.add_argument("--balance-method", type=str) parser.add_argument("--balance-ratio", type=float) +# Batched run args +parser.add_argument("--batch-size-list", type=comma_separated_ints) +parser.add_argument("--seq-len-q-list", type=comma_separated_ints) +parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) +parser.add_argument("--balance-ratio-list", type=comma_separated_floats) args = parser.parse_args() +# Load YAML file with open(args.config_path) as f: config = yaml.safe_load(f) del args.config_path for k, v in vars(args).items(): if v is None and k in config: setattr(args, k, config[k]) +# Set list arguments +if args.batch_size_list is None: + args.batch_size_list = [args.batch_size] +del args.batch_size +if args.seq_len_q_list is None: + args.seq_len_q_list = [args.seq_len_q] +del args.seq_len_q +if args.seq_len_kv_cache_list is None: + args.seq_len_kv_cache_list = [args.seq_len_kv_cache] +del args.seq_len_kv_cache +if args.balance_ratio_list is None: + args.balance_ratio_list = [args.balance_ratio] +del args.balance_ratio +# Set default values if args.max_batch_size is None: - args.max_batch_size = args.batch_size + args.max_batch_size = max(args.batch_size_list) if args.max_num_tokens is None: - args.max_num_tokens = args.max_batch_size * args.seq_len_q + args.max_num_tokens = args.max_batch_size * max(args.seq_len_q_list) print(args) # MPI args @@ -98,60 +123,83 @@ def comma_separated_ints(s): ) # Warm up -assert args.batch_size <= args.max_batch_size -assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len -run_pack = runner.create_run_pack( - args.run_type, - batch_size=args.batch_size, - seq_len_q=args.seq_len_q, - seq_len_kv_cache=args.seq_len_kv_cache, - kv_cache_manager=kv_cache_manager, - attn_workspace=attn_workspace, -) -runner.replace_routing_method( - balance_method=BalanceMethod[args.balance_method], balance_ratio=args.balance_ratio -) -capture_stream.wait_stream(torch.cuda.current_stream()) -with torch.cuda.stream(capture_stream): - run_pack() - with autotune(): +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( + args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list +): + assert batch_size <= args.max_batch_size + assert seq_len_q + seq_len_kv_cache <= args.max_seq_len + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, + ) + runner.replace_routing_method( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ) + capture_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(capture_stream): run_pack() -torch.cuda.current_stream().wait_stream(capture_stream) + with autotune(): + run_pack() + torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() -# Profile: capture graph and replay it torch.cuda.cudart().cudaProfilerStart() -if args.use_cuda_graph: - with with_multi_stream(True): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, 
stream=capture_stream, capture_error_mode="global"): - run_pack() +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( + args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list +): + # Profile: capture graph and replay it + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, + ) + runner.replace_routing_method( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ) + if args.use_cuda_graph: + with with_multi_stream(True): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): + run_pack() -warmup_times = 20 -run_times = 100 -events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] -for i in range(warmup_times + run_times): - events[i].record() - with nvtx.annotate(f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"): - if args.use_cuda_graph: - g.replay() - else: - run_pack() -events[-1].record() -torch.cuda.synchronize() - -# Print statistics -# Print before `cudaProfilerStop` to ensure messages are included in the profile -time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] -time_list = time_list[warmup_times:] -print( - f"[RANK {rank}]" - f" min {np.min(time_list) * 1000:.1f}" - f" max {np.max(time_list) * 1000:.1f}" - f" mean {np.mean(time_list) * 1000:.1f}" - f" median {np.median(time_list) * 1000:.1f}" - f" P90 {np.percentile(time_list, 90) * 1000:.1f}" - f" (us)" -) + warmup_times = 20 + run_times = 100 + events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] + for i in range(warmup_times + run_times): + events[i].record() + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" + with nvtx.annotate( + f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + ): + if args.use_cuda_graph: + g.replay() + else: + run_pack() + events[-1].record() + torch.cuda.synchronize() + # Print statistics + # Print before `cudaProfilerStop` to ensure messages are included in the profile + time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] + time_list = time_list[warmup_times:] + print( + f"[RANK {rank}]" + f" batch_size {batch_size}" + f" seq_len_q {seq_len_q}" + f" seq_len_kv_cache {seq_len_kv_cache}" + + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2f}") + + f" min {np.min(time_list) * 1000:.1f}" + f" max {np.max(time_list) * 1000:.1f}" + f" mean {np.mean(time_list) * 1000:.1f}" + f" median {np.median(time_list) * 1000:.1f}" + f" P90 {np.percentile(time_list, 90) * 1000:.1f}" + f" (us)" + ) torch.cuda.cudart().cudaProfilerStop() From 8adb6d9657785af34cb101e7a50e83422ddae6e6 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 08:13:46 +0000 Subject: [PATCH 03/16] Add batched run Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 67 ++++++++++++------- .../{run_single.py => run.py} | 0 .../{run_single.sh => run.sh} | 11 +-- .../tools/test_layer_wise_benchmarks.py | 10 +-- 4 files changed, 54 insertions(+), 34 deletions(-) rename examples/layer_wise_benchmarks/{run_single.py => run.py} (100%) rename examples/layer_wise_benchmarks/{run_single.sh => run.sh} (74%) diff --git a/examples/layer_wise_benchmarks/README.md 
b/examples/layer_wise_benchmarks/README.md index da4f2f2d65f..aca475662e8 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -2,7 +2,7 @@ ## Generate profiles -### Run with MPI +### Run with OpenMPI **Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI. @@ -16,45 +16,45 @@ pip install -e ../.. ```bash # Run DeepSeek-R1 NVFP4 -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml # Run DeepSeek-V3.2-Exp -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM # Run DeepSeek-V3.2-Exp with 32k context length -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 # Run with attention TP -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM # Run with MTP3 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 # Run 4 layers -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8 # Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml 
--scaled-from 16 --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP
 
 # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
-NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
+NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
 
 # Run Qwen3-Next (balanced routing is not implemented)
-NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified
-NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified
+NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified
+NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified
 
 # Run with DeepEP A2A
-NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_ctx.yaml --moe-backend WIDEEP
-NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP
 ```
 
 ### Run with Slurm
@@ -85,17 +85,34 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The
 
 ```bash
 # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
 
 # Run with attention TP and TRTLLMGen
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
 
 # Run with DeepEPLowLatency
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
 
 # You can run 4-GPU and 8-GPU tasks without reallocate the slurm job
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_ctx.yaml
+```
+
+### Batched run
+
+By specifying `--batch-size-list` on the command line (or `batch_size_list` in the YAML file), the script runs multiple configurations in a single process. 
This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. When `--batch-size-list` is set, the value of `--batch-size` is ignored. + +Supported list arguments: +- `--batch-size-list` (or `batch_size_list` in YAML) +- `--seq-len-q-list` (or `seq_len_q_list` in YAML) +- `--seq-len-kv-cache-list` (or `seq_len_kv_cache_list` in YAML) +- `--balance-ratio-list` (or `balance_ratio_list` in YAML) + +Run with OpenMPI: + +``` +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size-list 1,2,4 --seq-len-q-list 1024,8192 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size-list 32,64,128,256,512 --seq-len-q-list 1,2,3,4 ``` ## Parse profiles diff --git a/examples/layer_wise_benchmarks/run_single.py b/examples/layer_wise_benchmarks/run.py similarity index 100% rename from examples/layer_wise_benchmarks/run_single.py rename to examples/layer_wise_benchmarks/run.py diff --git a/examples/layer_wise_benchmarks/run_single.sh b/examples/layer_wise_benchmarks/run.sh similarity index 74% rename from examples/layer_wise_benchmarks/run_single.sh rename to examples/layer_wise_benchmarks/run.sh index be9aa6e5a4d..7d26041e970 100755 --- a/examples/layer_wise_benchmarks/run_single.sh +++ b/examples/layer_wise_benchmarks/run.sh @@ -13,17 +13,18 @@ if [ "$RANK" -eq 0 ]; then export TLLM_LOG_LEVEL=INFO fi +PROFILE_DIR=${PROFILE_DIR:-profiles} +mkdir -p ${PROFILE_DIR} + PROFILE=${PROFILE:-1} GPU_METRICS=${GPU_METRICS:-0} if [ "$PROFILE" -eq 1 ]; then - PROFILE_FOLDER=profiles/run_single - mkdir -p ${PROFILE_FOLDER} PROFILE_CMD="nsys profile -t cuda,nvtx -s none --cpuctxsw none --cuda-event-trace false --cuda-graph-trace node -c cudaProfilerApi --capture-range-end stop - -o ${PROFILE_FOLDER}/run_single_ep${WORLD_SIZE}_rank${RANK}.nsys-rep + -o ${PROFILE_DIR}/report_ep${WORLD_SIZE}_rank${RANK}.nsys-rep --force-overwrite true" if [ "$GPU_METRICS" -eq 1 ]; then PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK @@ -34,4 +35,6 @@ else fi set -x -$PROFILE_CMD python3 -u run_single.py "$@" +$PROFILE_CMD bash -c \ + "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_ep${WORLD_SIZE}_rank${RANK}.log\"" \ + bash "$@" diff --git a/tests/unittest/tools/test_layer_wise_benchmarks.py b/tests/unittest/tools/test_layer_wise_benchmarks.py index 14a02a9ae07..bedd1cca795 100644 --- a/tests/unittest/tools/test_layer_wise_benchmarks.py +++ b/tests/unittest/tools/test_layer_wise_benchmarks.py @@ -14,7 +14,7 @@ def test_deepseek_r1_ctx_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -35,7 +35,7 @@ def test_deepseek_r1_ctx_tep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -59,7 +59,7 @@ def test_deepseek_v32_ctx_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-V3.2-Exp-hf", @@ -82,7 +82,7 @@ def test_deepseek_r1_gen_scaled_from_16_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_gen.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -106,7 +106,7 @@ def test_qwen3_next_gen_tep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_gen.yaml", 
"--model", model_root / "Qwen3" / "Qwen3-Next-80B-A3B-Instruct", From a6a8c5627182165502b8ed8ce8c418fc5328e343 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 10:14:03 +0000 Subject: [PATCH 04/16] Move events outside Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/run.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index d48e9688091..6dc2a0df354 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -58,6 +58,9 @@ def comma_separated_floats(s): parser.add_argument("--seq-len-q-list", type=comma_separated_ints) parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) parser.add_argument("--balance-ratio-list", type=comma_separated_floats) +# Schedule +parser.add_argument("--warmup-times", type=int, default=20) +parser.add_argument("--run-times", type=int, default=100) args = parser.parse_args() # Load YAML file with open(args.config_path) as f: @@ -147,6 +150,11 @@ def comma_separated_floats(s): torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() +events = [ + torch.cuda.Event(enable_timing=True) for _ in range(args.warmup_times + args.run_times + 1) +] +[e.record() for e in events] # Explicitly warmup events because torch is lazy + torch.cuda.cudart().cudaProfilerStart() for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list @@ -169,15 +177,13 @@ def comma_separated_floats(s): with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): run_pack() - warmup_times = 20 - run_times = 100 - events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] - for i in range(warmup_times + run_times): + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" + nvtx_message = ( + f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + ) + for i in range(args.warmup_times + args.run_times): events[i].record() - balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" - with nvtx.annotate( - f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" - ): + with nvtx.annotate(nvtx_message): if args.use_cuda_graph: g.replay() else: @@ -188,7 +194,7 @@ def comma_separated_floats(s): # Print statistics # Print before `cudaProfilerStop` to ensure messages are included in the profile time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] - time_list = time_list[warmup_times:] + time_list = time_list[args.warmup_times :] print( f"[RANK {rank}]" f" batch_size {batch_size}" From 042f3c05f0080493a61db701209e4de789b10ac7 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 20 Nov 2025 04:15:33 +0000 Subject: [PATCH 05/16] Fix autotune Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/run.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index 6dc2a0df354..41a1c396cd4 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -126,9 +126,22 @@ def comma_separated_floats(s): ) # Warm up -for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( - 
args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list -): +for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ + [ + True, + max(args.batch_size_list), + max(args.seq_len_q_list), + args.seq_len_kv_cache_list[0], + args.balance_ratio_list[0], + ], + *itertools.product( + [False], + args.batch_size_list, + args.seq_len_q_list, + args.seq_len_kv_cache_list, + args.balance_ratio_list, + ), +]: assert batch_size <= args.max_batch_size assert seq_len_q + seq_len_kv_cache <= args.max_seq_len run_pack = runner.create_run_pack( @@ -144,9 +157,10 @@ def comma_separated_floats(s): ) capture_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(capture_stream): + if autotune_flag: + with autotune(): + run_pack() run_pack() - with autotune(): - run_pack() torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() From 991e7054f86f487094cb6bb68fbf90302dfa278a Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 20 Nov 2025 08:38:43 +0000 Subject: [PATCH 06/16] Fix balanced routing method Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 21 +- examples/layer_wise_benchmarks/run.py | 64 +++--- .../deepseekv3_runner.py | 135 +----------- .../layer_wise_benchmarks/runner_interface.py | 2 +- .../layer_wise_benchmarks/runner_utils.py | 202 +++++++++++++++++- .../tools/test_layer_wise_benchmarks.py | 1 + 6 files changed, 250 insertions(+), 175 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index aca475662e8..19f634a73f7 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -32,8 +32,8 @@ NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified # Run with MTP3 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 @@ -49,12 +49,21 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp # Run Qwen3-Next (balanced routing is not implemented) -NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified -NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified +NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4 +NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512 # Run with DeepEP A2A NP=4 
./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP + +# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank +# Note: if balance ratio is 0, ignore activating all experts +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 + +# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 ``` ### Run with Slurm @@ -87,8 +96,8 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP -# Run with attention TP and TRTLLMGen -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +# Run with TRTLLMGen +SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM # Run with DeepEPLowLatency SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index 41a1c396cd4..a1d51877047 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -152,16 +152,16 @@ def comma_separated_floats(s): kv_cache_manager=kv_cache_manager, attn_workspace=attn_workspace, ) - runner.replace_routing_method( + with runner.replace_routing_method_ctx( balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio - ) - capture_stream.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(capture_stream): - if autotune_flag: - with autotune(): - run_pack() - run_pack() - torch.cuda.current_stream().wait_stream(capture_stream) + ): + capture_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(capture_stream): + if autotune_flag: + with autotune(): + run_pack() + run_pack() + torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() events = [ @@ -182,27 +182,25 @@ def comma_separated_floats(s): kv_cache_manager=kv_cache_manager, attn_workspace=attn_workspace, ) - runner.replace_routing_method( + with runner.replace_routing_method_ctx( balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio - ) - if args.use_cuda_graph: - with with_multi_stream(True): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): - run_pack() - - balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" - nvtx_message = ( - f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" - ) - for i in range(args.warmup_times + args.run_times): - events[i].record() - with nvtx.annotate(nvtx_message): - if args.use_cuda_graph: - g.replay() - else: - 
run_pack() - events[-1].record() + ): + if args.use_cuda_graph: + with with_multi_stream(True): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): + run_pack() + + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" + nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + for i in range(args.warmup_times + args.run_times): + events[i].record() + with nvtx.annotate(nvtx_message): + if args.use_cuda_graph: + g.replay() + else: + run_pack() + events[-1].record() torch.cuda.synchronize() # Print statistics @@ -214,11 +212,11 @@ def comma_separated_floats(s): f" batch_size {batch_size}" f" seq_len_q {seq_len_q}" f" seq_len_kv_cache {seq_len_kv_cache}" - + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2f}") - + f" min {np.min(time_list) * 1000:.1f}" - f" max {np.max(time_list) * 1000:.1f}" - f" mean {np.mean(time_list) * 1000:.1f}" + + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2g}") + + f" mean {np.mean(time_list) * 1000:.1f}" f" median {np.median(time_list) * 1000:.1f}" + f" min {np.min(time_list) * 1000:.1f}" + f" max {np.max(time_list) * 1000:.1f}" f" P90 {np.percentile(time_list, 90) * 1000:.1f}" f" (us)" ) diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py index 307e591dd27..b2445b18b22 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py @@ -1,124 +1,16 @@ -import functools from typing import List, Optional import torch -import tensorrt_llm._torch.models.modeling_deepseekv3 from tensorrt_llm._torch.model_config import ModelConfig -from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer, DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm._torch.utils import AuxStreamType -from tensorrt_llm._utils import mpi_rank, mpi_world_size from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping -from .runner_interface import BalanceMethod, RunnerBase -from .runner_utils import RunnerMixin, ceil_div - - -class RoutingMethod(DeepseekV3Gate): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.world_size = mpi_world_size() - self.rank = mpi_rank() - self.balance_method = BalanceMethod.NotModified - self.balance_ratio = None - - def apply(self, router_logits) -> (torch.Tensor, torch.Tensor): - token_selected_experts, token_final_scales = super().apply(router_logits) - num_experts = self.weight.shape[0] - if self.balance_method == BalanceMethod.NotModified: - pass - elif self.balance_method == BalanceMethod.Balanced: - token_selected_experts = RoutingMethod.get_balanced_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedRanks: - token_selected_experts = RoutingMethod.get_all_to_one_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedExperts: - token_selected_experts = 
RoutingMethod.get_balanced_rank_imbalanced_expert_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - else: - raise NotImplementedError(f"Not support balance_method {self.balance_method}") - return token_selected_experts, token_final_scales - - @staticmethod - @functools.cache - def get_balanced_selection(num_tokens, top_k, num_experts, dtype, world_size, rank): - a = torch.arange(num_tokens * world_size * top_k, dtype=dtype, device="cuda").view( - num_tokens, world_size, top_k - )[:, rank] - experts = ( - a * (num_experts // world_size + 1) + a // num_experts * (num_experts // world_size) - ) % num_experts - return experts.contiguous() - - @staticmethod - def apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): - num_tokens, top_k = imbalanced_experts.shape - dtype = imbalanced_experts.dtype - balanced_experts = RoutingMethod.get_balanced_selection( - num_tokens, top_k, num_experts, dtype, world_size, rank - ) - num_balanced_tokens = round(num_tokens * balance_ratio) - if balance_ratio != 0: - # Activate all experts - num_balanced_tokens = max( - num_balanced_tokens, ceil_div(num_experts, world_size * top_k) - ) - mixed_experts = balanced_experts.clone() - mixed_experts[num_balanced_tokens:] = imbalanced_experts[num_balanced_tokens:] - return mixed_experts - - @staticmethod - @functools.cache - def get_all_to_one_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - assert num_experts // world_size >= top_k - imbalanced_experts = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view( - num_tokens, top_k - ) % (num_experts // world_size) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) - - @staticmethod - @functools.cache - def get_balanced_rank_imbalanced_expert_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - experts_per_rank = num_experts // world_size - activate_experts_per_rank = ceil_div(top_k, world_size) - a = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view(num_tokens, top_k) - narrow_experts = a % (activate_experts_per_rank * world_size) - imbalanced_experts = ( - narrow_experts * experts_per_rank % num_experts - + narrow_experts // world_size % experts_per_rank - ) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) +from .runner_interface import RunnerBase +from .runner_utils import RunnerMixin class DeepSeekV3Runner(RunnerMixin, RunnerBase): @@ -139,10 +31,6 @@ def __init__( moe_max_num_tokens: int, use_cuda_graph: bool, ): - # Temporally replace the gate class - gate_cls_orig = tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = RoutingMethod - self.model_config = ModelConfig.from_pretrained( pretrained_model_name_or_path, mapping=mapping, @@ -206,20 +94,3 @@ def __init__( layers[-1].next_layer_layernorm = next_layer_layernorm self.layers = layers - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = gate_cls_orig - - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): - if self.model_config.moe_backend not in [ - "CUTLASS", - "DEEPGEMM", - "TRTLLM", - "WIDEEP", - "CUTEDSL", - ]: - raise NotImplementedError( - f'Not support replace routing method for 
moe_backend "{self.model_config.moe_backend}",' - f' please set balance_method to "NotModified"' - ) - for layer in self.layers: - layer.mlp.gate.balance_method = balance_method - layer.mlp.gate.balance_ratio = balance_ratio diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py index 20d672da167..9451124e20c 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py @@ -28,7 +28,7 @@ def create_run_pack( pass @abstractmethod - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): + def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float): pass @staticmethod diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py index ea7201e7f2b..accdce26c1e 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py @@ -1,4 +1,5 @@ import contextlib +import functools import os import weakref from abc import ABC, abstractmethod @@ -33,6 +34,166 @@ def round_up(a, b): return ceil_div(a, b) * b +def get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank +): + # First, each sender selects target rank + target_rank_before_mod = torch.arange(num_tokens * world_size * top_k).view( + num_tokens, world_size, top_k + ) + target_rank_before_mod += top_k * torch.arange(num_tokens).view( + num_tokens, 1, 1 + ) # Shift `top_k` ranks for the next token on each rank, to balance network traffic + target_rank = target_rank_before_mod % world_size + # Second, each receiver selects target expert + target_expert = torch.empty_like(target_rank) + for reciever_rank in range(world_size): + mask = target_rank == reciever_rank + experts_per_rank = num_experts // world_size + local_expert = torch.arange(num_tokens * top_k) % experts_per_rank + target_expert[mask] = (reciever_rank * experts_per_rank) + local_expert + token_selected_experts = target_expert[:, rank].sort(dim=-1).values + return token_selected_experts.contiguous().to(dtype=dtype, device=device) + + +get_balanced_selection = functools.cache(get_balanced_selection_no_cache) + + +def test_get_balanced_selection(): + dtype = torch.long + for num_tokens in range(1, 33): + for num_experts in range(1, 65): + print(f"{num_tokens=} {num_experts=}") + for top_k in range(1, min(11, num_experts)): + for world_size in range(1, 65): + if num_experts % world_size == 0: + tokens_per_expert = torch.zeros(num_experts) + for rank in range(world_size): + token_selected_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, "cpu", world_size, rank + ) + sorted_selection = token_selected_experts.sort(dim=-1).values + if (sorted_selection[:, :-1] == sorted_selection[:, 1:]).any(): + raise ValueError(f"duplicated experts on rank {rank}") + experts_per_rank = num_experts // world_size + tokens_per_rank = ( + (token_selected_experts // experts_per_rank) + .view(-1) + .bincount(minlength=world_size) + ) + if tokens_per_rank.max() - tokens_per_rank.min() > 1: + raise ValueError(f"tokens sent from rank {rank} is not balanced") + tokens_per_expert += token_selected_experts.view(-1).bincount( + minlength=num_experts + ) + if tokens_per_expert.max() - tokens_per_expert.min() > 1: + raise ValueError("tokens per expert is not balanced") + + +def 
apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): + num_tokens, top_k = imbalanced_experts.shape + dtype = imbalanced_experts.dtype + device = imbalanced_experts.device + balanced_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank + ) + if balance_ratio == 0.0: + num_balanced_tokens = 0 + else: + # Activate all experts + min_num_balanced_tokens = min(num_tokens, ceil_div(num_experts, world_size * top_k)) + num_balanced_tokens = min_num_balanced_tokens + round( + (num_tokens - min_num_balanced_tokens) * balance_ratio + ) + mixed_experts = torch.cat( + [balanced_experts[:num_balanced_tokens], imbalanced_experts[num_balanced_tokens:]] + ) + return mixed_experts + + +@functools.cache +def get_all_to_one_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + if top_k > experts_per_rank: + raise ValueError( + "Cannot send all tokens to a single rank because `top_k > experts_per_rank`" + ) + imbalanced_experts = ( + torch.arange( + rank * num_tokens * top_k, (rank + 1) * num_tokens * top_k, dtype=dtype, device=device + ).view(num_tokens, top_k) + % experts_per_rank + ) + imbalanced_experts = imbalanced_experts.sort(dim=-1).values + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +@functools.cache +def get_balanced_rank_imbalanced_expert_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + active_experts_per_rank = ceil_div(top_k, world_size) + # Select expert from [0, active_experts_per_rank * world_size), + # then scale to [0, experts_per_rank * world_size) + narrow_experts = get_balanced_selection_no_cache( + num_tokens, top_k, active_experts_per_rank * world_size, dtype, device, world_size, rank + ) + imbalanced_experts = ( + narrow_experts // active_experts_per_rank * experts_per_rank + + narrow_experts % active_experts_per_rank + ) + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +def make_balanced_routing_method( + apply_method_orig, num_experts, balance_method, balance_ratio, world_size, rank +): + def balanced_routing_method(router_logits): + token_selected_experts, token_final_scales = apply_method_orig(router_logits) + if balance_method == BalanceMethod.NotModified: + pass + elif balance_method == BalanceMethod.Balanced: + token_selected_experts = get_balanced_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedRanks: + token_selected_experts = get_all_to_one_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedExperts: + token_selected_experts = get_balanced_rank_imbalanced_expert_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + else: + raise NotImplementedError(f"Not support balance_method {balance_method}") + return token_selected_experts, token_final_scales + + return 
balanced_routing_method + + class RunnerMixin(ABC): @staticmethod @abstractmethod @@ -186,9 +347,44 @@ def run_pack(): return run_pack - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): - if balance_method != BalanceMethod.NotModified: - raise NotImplementedError("not support replacing routing method for this runner") + @contextlib.contextmanager + def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float): + if balance_method == BalanceMethod.NotModified: + pass + elif self.model_config.moe_backend not in [ + "CUTEDSL", + "CUTLASS", + "DEEPGEMM", + "TRTLLM", + "WIDEEP", + ]: + raise NotImplementedError( + f'Not support replace routing method for moe_backend "{self.model_config.moe_backend}",' + f' please set balance_method to "NotModified"' + ) + elif ( + self.model_config.moe_backend == "TRTLLM" + and not self.model_config.mapping.enable_attention_dp + ): + raise NotImplementedError( + 'Not support replace routing method for moe_backend "TRTLLM" with attention TP,' + ' please set balance_method to "NotModified"' + ) + apply_methods_orig = [layer.mlp.experts.routing_method.apply for layer in self.layers] + try: + for layer, apply_method_orig in zip(self.layers, apply_methods_orig): + layer.mlp.experts.routing_method.apply = make_balanced_routing_method( + apply_method_orig, + layer.mlp.experts.num_experts, + balance_method, + balance_ratio, + layer.mlp.experts.ep_size, + layer.mlp.experts.ep_rank, + ) + yield + finally: + for layer, apply_method_orig in zip(self.layers, apply_methods_orig): + layer.mlp.experts.routing_method.apply = apply_method_orig @staticmethod def create_kv_cache_manager( diff --git a/tests/unittest/tools/test_layer_wise_benchmarks.py b/tests/unittest/tools/test_layer_wise_benchmarks.py index bedd1cca795..56d73c08952 100644 --- a/tests/unittest/tools/test_layer_wise_benchmarks.py +++ b/tests/unittest/tools/test_layer_wise_benchmarks.py @@ -41,6 +41,7 @@ def test_deepseek_r1_ctx_tep(llm_root, world_size): model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", "--no-enable-attention-dp", "--moe-backend=TRTLLM", + "--balance-method=NotModified", ], cwd=llm_root / "examples" / "layer_wise_benchmarks", env={ From 23d004f2ba8bd9d65282c03b59b6c25be30c4222 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 21 Nov 2025 04:31:34 +0000 Subject: [PATCH 07/16] Update list option interface Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 16 ++++---- examples/layer_wise_benchmarks/run.py | 48 ++++++++++++------------ 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 19f634a73f7..d0ebcf081a8 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -109,19 +109,21 @@ SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gtx.ya ### Batched run -By specifying `--batch-size-list` on the command line (or `batch_size_list` in the YAML file), the script runs multiple configurations in a single process. This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. When `--batch-size-list` is set, the value of `--batch-size` is ignored. +By specifying a list for `--batch-size` on the command line (or `batch_size` in the YAML file), the script runs multiple configurations in a single process. 
This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. Supported list arguments: -- `--batch-size-list` (or `batch_size_list` in YAML) -- `--seq-len-q-list` (or `seq_len_q_list` in YAML) -- `--seq-len-kv-cache-list` (or `seq_len_kv_cache_list` in YAML) -- `--balance-ratio-list` (or `balance_ratio_list` in YAML) +- `--batch-size` (or `batch_size` in YAML) +- `--seq-len-q` (or `seq_len_q` in YAML) +- `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML) +- `--balance-ratio` (or `balance_ratio` in YAML) + +Command line arguments are comma separated, for example, `--batch-size 1,2,4`. Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`. Run with OpenMPI: ``` -NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size-list 1,2,4 --seq-len-q-list 1024,8192 -NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size-list 32,64,128,256,512 --seq-len-q-list 1,2,3,4 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size 1,2,4 --seq-len-q 1024,8192 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size 32,64,128,256,512 --seq-len-q 1,2,3,4 ``` ## Parse profiles diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index a1d51877047..6d152fb9a75 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -48,16 +48,11 @@ def comma_separated_floats(s): group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph") parser.set_defaults(use_cuda_graph=None) # Per iteration args -parser.add_argument("--batch-size", type=int) -parser.add_argument("--seq-len-q", type=int) -parser.add_argument("--seq-len-kv-cache", type=int) +parser.add_argument("--batch-size", type=comma_separated_ints, dest="batch_size_list") +parser.add_argument("--seq-len-q", type=comma_separated_ints, dest="seq_len_q_list") +parser.add_argument("--seq-len-kv-cache", type=comma_separated_ints, dest="seq_len_kv_cache_list") parser.add_argument("--balance-method", type=str) -parser.add_argument("--balance-ratio", type=float) -# Batched run args -parser.add_argument("--batch-size-list", type=comma_separated_ints) -parser.add_argument("--seq-len-q-list", type=comma_separated_ints) -parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) -parser.add_argument("--balance-ratio-list", type=comma_separated_floats) +parser.add_argument("--balance-ratio", type=comma_separated_floats, dest="balance_ratio_list") # Schedule parser.add_argument("--warmup-times", type=int, default=20) parser.add_argument("--run-times", type=int, default=100) @@ -67,21 +62,26 @@ def comma_separated_floats(s): config = yaml.safe_load(f) del args.config_path for k, v in vars(args).items(): - if v is None and k in config: - setattr(args, k, config[k]) -# Set list arguments -if args.batch_size_list is None: - args.batch_size_list = [args.batch_size] -del args.batch_size -if args.seq_len_q_list is None: - args.seq_len_q_list = [args.seq_len_q] -del args.seq_len_q -if args.seq_len_kv_cache_list is None: - args.seq_len_kv_cache_list = [args.seq_len_kv_cache] -del args.seq_len_kv_cache -if args.balance_ratio_list is None: - args.balance_ratio_list = [args.balance_ratio] -del args.balance_ratio + if k.endswith("_list"): + config_key = k[: -len("_list")] + if v is None and config_key in config: + v = config[config_key] + if isinstance(v, list): + pass + elif v is None or isinstance(v, (int, 
float)): + v = [v] + else: + raise ValueError(f'Config "{config_key}" in YAML should be a value or a list') + setattr(args, k, v) + else: + config_key = k + if v is None and config_key in config: + v = config[config_key] + setattr(args, k, v) + if config_key in config: + del config[config_key] +if config: + raise ValueError(f"Config {','.join(config.keys())} from file are not options") # Set default values if args.max_batch_size is None: args.max_batch_size = max(args.batch_size_list) From 99fa5b0e19926f348c20e9bed329a7761989ab49 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 21 Nov 2025 13:22:24 +0000 Subject: [PATCH 08/16] Add a parser (WIP) Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/parse.py | 298 ++++++++++++++++++++++++ examples/layer_wise_benchmarks/run.py | 11 + examples/layer_wise_benchmarks/run.sh | 4 +- 3 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 examples/layer_wise_benchmarks/parse.py diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py new file mode 100644 index 00000000000..b00403c532c --- /dev/null +++ b/examples/layer_wise_benchmarks/parse.py @@ -0,0 +1,298 @@ +import argparse +import bisect +import json +import re +import sqlite3 +import subprocess +from pathlib import Path + +import numpy as np +import pandas as pd + +# Parse cmdline +parser = argparse.ArgumentParser() +parser.add_argument("--profile-dir", type=str, default="profiles") +parser.add_argument("--world-size", type=int) +parser.add_argument("--rank", type=int, default=0) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument("--unknown-kernel-as-error", action="store_true", dest="unknown_kernel_as_error") +group.add_argument( + "--no-unknown-kernel-as-error", action="store_false", dest="unknown_kernel_as_error" +) +args = parser.parse_args() + + +def lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path): + if ( + not sqlite_file_path.is_file() + or nsys_rep_file_path.stat().st_mtime > sqlite_file_path.stat().st_mtime + ): + subprocess.check_call( + [ + "nsys", + "export", + "--type", + "sqlite", + "-o", + sqlite_file_path, + "--force-overwrite=true", + nsys_rep_file_path, + ] + ) + + +def shortest_common_supersequence(a, b): + # Merge two lists into their shortest common supersequence, + # so that both `a` and `b` are subsequences of the result. + # Uses dynamic programming to compute the shortest common supersequence, then reconstructs it. 
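+    # For example, for a = ["A", "B", "C"] and b = ["B", "D"], one shortest common
+    # supersequence is ["A", "B", "C", "D"]; its length equals len(a) + len(b) minus
+    # the length of the longest common subsequence of a and b (here 3 + 2 - 1 = 4).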
+ m, n = len(a), len(b) + dp = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(m + 1): + dp[i][0] = i + for j in range(n + 1): + dp[0][j] = j + for i in range(1, m + 1): + for j in range(1, n + 1): + if a[i - 1] == b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1) + # Backtrack to build the merged sequence + res = [] + i, j = m, n + while i > 0 and j > 0: + if a[i - 1] == b[j - 1]: + res.append(a[i - 1]) + i -= 1 + j -= 1 + elif dp[i - 1][j] < dp[i][j - 1]: + res.append(a[i - 1]) + i -= 1 + else: + res.append(b[j - 1]) + j -= 1 + while i > 0: + res.append(a[i - 1]) + i -= 1 + while j > 0: + res.append(b[j - 1]) + j -= 1 + res.reverse() + return res + + +nsys_rep_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.sqlite" +lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) + +conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) + +query = "SELECT * FROM ENUM_NSYS_EVENT_TYPE" +df = pd.read_sql_query(query, conn) +event_id_NvtxDomainCreate = df[df["name"] == "NvtxDomainCreate"].iloc[0]["id"].tolist() +event_id_NvtxPushPopRange = df[df["name"] == "NvtxPushPopRange"].iloc[0]["id"].tolist() + +query = "SELECT domainId FROM NVTX_EVENTS WHERE eventType = ? AND text = ?" +df = pd.read_sql_query(query, conn, params=(event_id_NvtxDomainCreate, "NCCL")) +nccl_domain_id = -1 if df.empty else df.iloc[0]["domainId"].tolist() + +query = """SELECT T1.start, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value LIKE ?""" +df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %")) +problem_start = [] +problem_set = [] +for start, text in df.itertuples(index=False): + if text.startswith("layer_wise_benchmarks args {"): + run_args = json.loads(text[len("layer_wise_benchmarks args") :]) + elif text.startswith("layer_wise_benchmarks problem_spec {"): + problem_start.append(start) + problem_set.append( + { + "spec": json.loads(text[len("layer_wise_benchmarks problem_spec") :]), + "text": "", + "runs": [], + "runs_end": [], + "ranges": [], + } + ) + +query = """SELECT T1.start, T1.end, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value NOT LIKE ? 
AND domainId != ?""" +df = pd.read_sql_query( + query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", nccl_domain_id) +) +for start, end, text in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + if re.match(r"b=\d+ s=\d+ ", text): + problem_set[problem_id]["text"] = text + problem_set[problem_id]["runs"].append(start) + problem_set[problem_id]["runs_end"].append(end) + else: + problem_set[problem_id]["ranges"].append((start, end, text)) + +query = """SELECT unified.start, unified.end, unified.demangledName, + R.start AS runtime_start, R.end AS runtime_end, + CGE2.start AS capture_start, CGE2.end AS capture_end +FROM ( + SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId + FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1 + UNION ALL + SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2 + UNION ALL + SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3 +) AS unified +JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND + CGE1.originalGraphNodeId IS NOT NULL +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId""" +df = pd.read_sql_query(query, conn) +kernel_list = [] +for ( + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + run_id = bisect.bisect(problem_set[problem_id]["runs"], runtime_start) - 1 + if ( + run_id == -1 + or run_id == len(problem_set[problem_id]["runs"]) + or runtime_start >= problem_set[problem_id]["runs_end"][run_id] + ): + run_id = -1 + ranges = [ + text + for range_start, range_end, text in problem_set[problem_id]["ranges"] + if capture_start >= range_start and capture_end <= range_end + ] + kernel_list.append( + ( + problem_id, + run_id, + ranges, + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, + ) + ) + +query = "SELECT * FROM StringIds" +df = pd.read_sql_query(query, conn) +string_ids = dict(zip(df["id"], df["value"])) + +conn.close() + +kernel_list.sort(key=lambda t: (t[6], t[8])) +kernels = [[[] for _ in problem["runs"]] for problem in problem_set] +for ( + problem_id, + run_id, + ranges, + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in kernel_list: + if run_id != -1: + kernels[problem_id][run_id].append((demangledName, start, end, ranges)) +for problem_id in range(len(kernels)): + required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]] + for run_id in range(len(kernels[problem_id])): + seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]] + assert seq == required_seq + + +parser_keywords = [ + ("Gemm", "nvjet"), + ("RMSNorm", "RMSNormKernel"), + ("Cat", "CatArrayBatchedCopy"), + ("RoPE", "applyMLARope"), + ("fmha", "fmhaSm100fKernel_Qkv"), + ("fmhaReduction", "fmhaReductionKernel"), + ("quant", "quantize_with_block_size"), + ("expandInput", "expandInputRowsKernel"), + ("computeStrides", "computeStridesTmaWarpSpecializedKernel"), + ("GroupGemm", "cutlass::device_kernel&1 | tee \"$PROFILE_DIR/report_ep${WORLD_SIZE}_rank${RANK}.log\"" \ + "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_np${WORLD_SIZE}_rank${RANK}.log\"" 
\ bash "$@" From 6a9ce27cee612719011b712eaac22df540fa923c Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 24 Nov 2025 13:32:58 +0000 Subject: [PATCH 09/16] Add HTML, support more models Signed-off-by: Tailing Yuan --- .gitignore | 1 + examples/layer_wise_benchmarks/parse.py | 173 ++++- examples/layer_wise_benchmarks/run.py | 5 +- examples/layer_wise_benchmarks/template.html | 714 ++++++++++++++++++ .../tools/layer_wise_benchmarks/__init__.py | 6 +- .../tools/layer_wise_benchmarks/mark_utils.py | 24 + 6 files changed, 879 insertions(+), 44 deletions(-) create mode 100644 examples/layer_wise_benchmarks/template.html create mode 100644 tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py diff --git a/.gitignore b/.gitignore index e2e8a6e58ef..78d8da20e47 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ cpp/include/tensorrt_llm/executor/version.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h .devcontainer/.env +/examples/layer_wise_benchmarks/profiles/ # User config files CMakeUserPresets.json diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index b00403c532c..e979c6105e6 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -6,20 +6,23 @@ import subprocess from pathlib import Path +import jinja2 import numpy as np import pandas as pd # Parse cmdline parser = argparse.ArgumentParser() parser.add_argument("--profile-dir", type=str, default="profiles") -parser.add_argument("--world-size", type=int) +parser.add_argument("--world-size", "--np", type=int) parser.add_argument("--rank", type=int, default=0) group = parser.add_mutually_exclusive_group(required=False) -group.add_argument("--unknown-kernel-as-error", action="store_true", dest="unknown_kernel_as_error") +group.add_argument("--error-on-unknown-kernel", action="store_true", dest="error_on_unknown_kernel") group.add_argument( - "--no-unknown-kernel-as-error", action="store_false", dest="unknown_kernel_as_error" + "--no-error-on-unknown-kernel", action="store_false", dest="error_on_unknown_kernel" ) +parser.set_defaults(error_on_unknown_kernel=None) args = parser.parse_args() +print(args) def lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path): @@ -81,8 +84,10 @@ def shortest_common_supersequence(a, b): return res -nsys_rep_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" -sqlite_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.sqlite" +profile_dir = Path(args.profile_dir) +nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" +html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) @@ -121,12 +126,15 @@ def shortest_common_supersequence(a, b): query = """SELECT T1.start, T1.end, T2.value AS text FROM NVTX_EVENTS AS T1 JOIN StringIds AS T2 ON T1.textId = T2.id - WHERE eventType = ? AND T2.value NOT LIKE ? AND domainId != ?""" + WHERE eventType = ? AND T2.value NOT LIKE ? AND T2.value NOT LIKE ? 
AND domainId != ?""" df = pd.read_sql_query( - query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", nccl_domain_id) + query, + conn, + params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", "[DG]%", nccl_domain_id), ) for start, end, text in df.itertuples(index=False): problem_id = bisect.bisect(problem_start, start) - 1 + assert problem_id != -1 if re.match(r"b=\d+ s=\d+ ", text): problem_set[problem_id]["text"] = text problem_set[problem_id]["runs"].append(start) @@ -134,19 +142,23 @@ def shortest_common_supersequence(a, b): else: problem_set[problem_id]["ranges"].append((start, end, text)) -query = """SELECT unified.start, unified.end, unified.demangledName, +query = """SELECT name FROM sqlite_master WHERE type = ?""" +df = pd.read_sql_query(query, conn, params=("table",)) +tables = df["name"].tolist() +unified_subquery = """SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId + FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1""" +if "CUPTI_ACTIVITY_KIND_MEMCPY" in tables: + unified_subquery += """ UNION ALL + SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2""" +if "CUPTI_ACTIVITY_KIND_MEMSET" in tables: + unified_subquery += """ UNION ALL + SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3""" +query = f"""SELECT unified.start, unified.end, unified.demangledName, R.start AS runtime_start, R.end AS runtime_end, CGE2.start AS capture_start, CGE2.end AS capture_end -FROM ( - SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId - FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1 - UNION ALL - SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId - FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2 - UNION ALL - SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId - FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3 -) AS unified +FROM ({unified_subquery}) AS unified JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND CGE1.originalGraphNodeId IS NOT NULL @@ -189,6 +201,7 @@ def shortest_common_supersequence(a, b): capture_end, ) ) +# TODO: Parse CTX phases query = "SELECT * FROM StringIds" df = pd.read_sql_query(query, conn) @@ -220,22 +233,25 @@ def shortest_common_supersequence(a, b): parser_keywords = [ - ("Gemm", "nvjet"), + ("cuBLASGemm", "nvjet"), + ("splitKreduce", "splitKreduce_kernel"), + ("fusedAGemm", "fused_a_gemm_kernel"), ("RMSNorm", "RMSNormKernel"), - ("Cat", "CatArrayBatchedCopy"), - ("RoPE", "applyMLARope"), - ("fmha", "fmhaSm100fKernel_Qkv"), + ("torchCat", "CatArrayBatchedCopy"), + ("applyMLARope", "applyMLARope"), + ("fmhaSm100f", "fmhaSm100fKernel_Qkv"), ("fmhaReduction", "fmhaReductionKernel"), ("quant", "quantize_with_block_size"), + ("AllGather", "ncclDevKernel_AllGather_"), + ("ReduceScatter", "ncclDevKernel_ReduceScatter_"), + ("allreduce_oneshot", "allreduce_fusion_kernel_oneshot_lamport"), + ("allreduce_twoshot", "allreduce_fusion_kernel_twoshot_sync"), ("expandInput", "expandInputRowsKernel"), ("computeStrides", "computeStridesTmaWarpSpecializedKernel"), - ("GroupGemm", "cutlass::device_kernel", "at::native::CUDAFunctorOnSelf_add"), + ("convert_req_index", "_convert_req_index_to_global_index_kernel_with_stride_factor"), + ("preprocess_after_permute", "_preprocess_after_permute_kernel"), + ("masked_index_copy_quant", 
"_masked_index_copy_group_quant_fp8"), + ("swiglu_quant", "_silu_and_mul_post_quant_kernel"), + ("masked_index_gather", "masked_index_gather_kernel"), + ("finalizeMoeRouting", "tensorrt_llm::kernels::cutlass_kernels::finalizeMoeRoutingKernel<"), + ("fused_qkvzba_split", "fused_qkvzba_split_reshape_cat_kernel"), + ("causal_conv1d_update", "tensorrt_llm::kernels::causal_conv1d::causal_conv1d_update_kernel<"), + ("fused_delta_rule_update", "fused_sigmoid_gating_delta_rule_update_kernel"), + ("layer_norm_fwd_1pass", "_layer_norm_fwd_1pass_kernel"), + ("torchGatherTopK", "at::native::sbtopk::gatherTopK<"), + ("softmax_warp_forward", "softmax_warp_forward<"), + ("torchSigmoid", "at::native::sigmoid_kernel_cuda"), + ("torchMul", "at::native::binary_internal::MulFunctor<"), + ("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"), ] warned_names = set() @@ -265,7 +317,7 @@ def parse_kernel_name(demangledName): if name not in warned_names: print(f"Unknown kernel name: {name}") warned_names.add(name) - if args.unknown_kernel_as_error: + if args.error_on_unknown_kernel: raise NotImplementedError(f"Unknown kernel name: {name}") return name[:20] @@ -274,21 +326,26 @@ def parse_kernel_name(demangledName): for runs in kernels: converted_seq = [] for i, (demangledName, _, _, ranges) in enumerate(runs[0]): - # TODO: Group by ranges name = parse_kernel_name(demangledName) - if ranges: - name = f"{ranges[0]}.{name}" + category = (*ranges, name) time_list = [run[i][2] - run[i][1] for run in runs] time_list = time_list[run_args["warmup_times"] :] - time = np.mean(time_list).tolist() - converted_seq.append((name, time)) + t = np.mean(time_list).tolist() + converted_seq.append((category, t)) converted_seqs.append(converted_seq) merged_title = [] for converted_seq in converted_seqs: title = [name for name, _ in converted_seq] merged_title = shortest_common_supersequence(merged_title, title) -print(merged_title) + +merged_data = [[0.0] * len(problem_set) for _ in merged_title] +for problem_id, converted_seq in enumerate(converted_seqs): + cur = 0 + for category, t in converted_seq: + cur = merged_title.index(category, cur) + merged_data[cur][problem_id] = t + cur += 1 print("Problem set:") for problem in problem_set: @@ -296,3 +353,43 @@ def parse_kernel_name(demangledName): f'- "{problem["text"]}" {len(problem["runs"])} runs' f" Ranges: [{', '.join(text for _, _, text in problem['ranges'])}]" ) + +stack = [] +js_data = [] +js_stack = [js_data] +max_title_len = max((len(title) - 1) * 3 + len(title[-1]) for title in merged_title) +for title, time_data in zip(merged_title, merged_data): + while stack != list(title[: len(stack)]): + level_title = stack[-1] + stack.pop() + js_stack[-2].append( + { + "name": level_title, + "children": js_stack[-1], + } + ) + js_stack.pop() + while len(stack) != len(title) - 1: + level_title = title[len(stack)] + stack.append(level_title) + level = len(stack) + print("|--" * (level - 1) + level_title) + js_stack.append([]) + level = len(stack) + 1 + print( + "|--" * (level - 1) + title[-1] + " " * (max_title_len - (level - 1) * 3 - len(title[-1])), + *[f"{x / 1000:-6.2f}" for x in time_data], + ) + js_stack[-1].append( + { + "name": title[-1], + "time": [x / 1000 for x in time_data], + } + ) +# TODO: Print overlap and space +# TODO: Statistics +js_header_config = [{"name": problem["text"]} for problem in problem_set] +loader = jinja2.FileSystemLoader(Path(__file__).parent) +template = jinja2.Environment(loader=loader).get_template("template.html") +with 
html_file_path.open("w") as f: + f.write(template.render(headerConfig=js_header_config, rawData=js_data)) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index b2bcf1d70ae..6a074cb67d9 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -10,7 +10,7 @@ from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size -from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls +from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges def comma_separated_ints(s): @@ -97,6 +97,7 @@ def comma_separated_floats(s): torch.cuda.set_device(local_rank) # Create KV cache manager +mark_ranges() Runner = get_runner_cls(args.model) mapping = Runner.create_mapping(enable_attention_dp=args.enable_attention_dp) kv_cache_manager = Runner.create_kv_cache_manager( @@ -203,7 +204,7 @@ def comma_separated_floats(s): run_pack() balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" - nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} NP{world_size}" for i in range(args.warmup_times + args.run_times): events[i].record() with nvtx.annotate(nvtx_message): diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html new file mode 100644 index 00000000000..43f1d915a02 --- /dev/null +++ b/examples/layer_wise_benchmarks/template.html @@ -0,0 +1,714 @@ + + + + + + CUDA Kernel Latency Dashboard + + + + + +
[template.html: 714 lines of dashboard markup are not preserved in this extract; the file adds a Controls toolbar (arrow keys to move, +/- to toggle, click charts to select), a "Row Analysis" panel, and a "Column Hierarchy" panel]
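A standalone illustration (not part of the patch; the kernel labels below are made up) of how parse.py aligns per-problem kernel sequences into shared report rows: merge the sequences with a shortest common supersequence, then index each problem's timings into the merged order. The DP mirrors shortest_common_supersequence() introduced above, trimmed for brevity.

```python
def scs(a, b):
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1]) + 1
    res, i, j = [], m, n
    while i > 0 and j > 0:  # backtrack to reconstruct the merged sequence
        if a[i - 1] == b[j - 1]:
            res.append(a[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] < dp[i][j - 1]:
            res.append(a[i - 1])
            i -= 1
        else:
            res.append(b[j - 1])
            j -= 1
    res.extend(a[:i][::-1])
    res.extend(b[:j][::-1])
    return res[::-1]

seq_a = ["RMSNorm", "cuBLASGemm", "fmhaSm100f"]  # kernels recorded for problem A
seq_b = ["RMSNorm", "fmhaSm100f", "quant"]       # kernels recorded for problem B
print(scs(seq_a, seq_b))
# ['RMSNorm', 'cuBLASGemm', 'fmhaSm100f', 'quant']
# Both inputs remain subsequences of the result, so parse.py can place each problem's
# times with merged_title.index(category, cur) and leave rows it never ran at 0.0.
```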
+ + + + diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py index 607e110b62e..e347df3ca8e 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py @@ -1,7 +1,5 @@ +from .mark_utils import mark_ranges from .runner_factory import get_runner_cls from .runner_interface import BalanceMethod -__all__ = [ - "BalanceMethod", - "get_runner_cls", -] +__all__ = ["BalanceMethod", "get_runner_cls", "mark_ranges"] diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py new file mode 100644 index 00000000000..72625d10598 --- /dev/null +++ b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py @@ -0,0 +1,24 @@ +import nvtx + +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_qwen3_next import ( + Qwen3NextGatedDeltaNet, + Qwen3NextSparseMoeBlock, +) +from tensorrt_llm._torch.modules.attention import MLA, Attention +from tensorrt_llm._torch.modules.fused_moe.interface import MoE +from tensorrt_llm._torch.modules.gated_mlp import GatedMLP + + +def mark_ranges(): + DeepseekV3Gate.forward = nvtx.annotate("DeepseekV3Gate")(DeepseekV3Gate.forward) + Qwen3NextGatedDeltaNet.forward = nvtx.annotate("Qwen3NextGatedDeltaNet")( + Qwen3NextGatedDeltaNet.forward + ) + Qwen3NextSparseMoeBlock.forward = nvtx.annotate("Qwen3NextSparseMoeBlock")( + Qwen3NextSparseMoeBlock.forward + ) + MLA.forward = nvtx.annotate("MLA")(MLA.forward) + Attention.forward = nvtx.annotate("Attention")(Attention.forward) + MoE.forward = nvtx.annotate("MoE")(MoE.forward) + GatedMLP.forward = nvtx.annotate("GatedMLP")(GatedMLP.forward) From 4029ab5cff30921ee6cb10bf6e47c47915c359a4 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 04:39:21 +0000 Subject: [PATCH 10/16] Add csv, add tests Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 16 +++++- examples/layer_wise_benchmarks/parse.py | 57 +++++++++++++++---- examples/layer_wise_benchmarks/template.html | 26 +++++++-- .../tools/test_layer_wise_benchmarks.py | 18 ++++++ 4 files changed, 98 insertions(+), 19 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index d0ebcf081a8..33d88e4f973 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -128,7 +128,21 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID ## Parse profiles -Coming soon. +Run the following command in the container: + +```bash +python3 parse.py --world-size 4 + +# Specify the location of the .nsys-rep file +python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0 +``` + +It can parse only GEN phase profiles for now. + +You will receive three reports, each containing kernel timing statistics grouped by module: +1. A printed report on stdout +2. A CSV report at `profiles/report_np4_rank0.csv` +3. 
An HTML report at `profiles/report_np4_rank0.html` ## Trouble shooting diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index e979c6105e6..af014edbe20 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -1,5 +1,6 @@ import argparse import bisect +import csv import json import re import sqlite3 @@ -15,6 +16,7 @@ parser.add_argument("--profile-dir", type=str, default="profiles") parser.add_argument("--world-size", "--np", type=int) parser.add_argument("--rank", type=int, default=0) +parser.add_argument("--warmup-times", type=int) group = parser.add_mutually_exclusive_group(required=False) group.add_argument("--error-on-unknown-kernel", action="store_true", dest="error_on_unknown_kernel") group.add_argument( @@ -87,6 +89,7 @@ def shortest_common_supersequence(a, b): profile_dir = Path(args.profile_dir) nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" +csv_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.csv" html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) @@ -301,6 +304,9 @@ def shortest_common_supersequence(a, b): ("torchSigmoid", "at::native::sigmoid_kernel_cuda"), ("torchMul", "at::native::binary_internal::MulFunctor<"), ("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"), + ("routingIndicesHistogramScores", "routingRenormalize::routingIndicesHistogramScoresKernel<"), + ("routingIndicesHistogram", "routingIndicesHistogramKernel<"), + ("routingIndicesOffsets", "routingIndicesOffsetsKernel<"), ] warned_names = set() @@ -319,19 +325,39 @@ def parse_kernel_name(demangledName): warned_names.add(name) if args.error_on_unknown_kernel: raise NotImplementedError(f"Unknown kernel name: {name}") - return name[:20] + return name[:30] converted_seqs = [] for runs in kernels: + warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times converted_seq = [] + # Kernel time for i, (demangledName, _, _, ranges) in enumerate(runs[0]): name = parse_kernel_name(demangledName) category = (*ranges, name) time_list = [run[i][2] - run[i][1] for run in runs] - time_list = time_list[run_args["warmup_times"] :] - t = np.mean(time_list).tolist() + t = np.mean(time_list[warmup_times:]).tolist() converted_seq.append((category, t)) + # Space and Overlap + overlap_list = [] + space_list = [] + for run in runs: + sorted_run = sorted(run, key=lambda op: op[1]) + last_end = sorted_run[0][1] + overlap_time = 0 + space_time = 0 + for _, start, end, _ in sorted_run: + if start > last_end: + space_time += start - last_end + else: + overlap_time += min(last_end, end) - start + last_end = max(last_end, end) + overlap_list.append(-overlap_time) + space_list.append(space_time) + converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist())) + converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist())) + converted_seq.append((("Total",), sum(t for _, t in converted_seq))) converted_seqs.append(converted_seq) merged_title = [] @@ -355,6 +381,7 @@ def parse_kernel_name(demangledName): ) stack = [] +csv_data = [["", *[problem["text"] for problem in problem_set]]] js_data = [] js_stack = [js_data] max_title_len = max((len(title) - 1) * 3 + len(title[-1]) for title in merged_title) @@ -374,20 +401,26 
@@ def parse_kernel_name(demangledName): stack.append(level_title) level = len(stack) print("|--" * (level - 1) + level_title) + csv_data.append(["|--" * (level - 1) + level_title]) js_stack.append([]) level = len(stack) + 1 print( "|--" * (level - 1) + title[-1] + " " * (max_title_len - (level - 1) * 3 - len(title[-1])), - *[f"{x / 1000:-6.2f}" for x in time_data], - ) - js_stack[-1].append( - { - "name": title[-1], - "time": [x / 1000 for x in time_data], - } + *[f"{x / 1000:-6.1f}" for x in time_data], ) -# TODO: Print overlap and space -# TODO: Statistics + csv_data.append(["|--" * (level - 1) + title[-1], *[f"{x / 1000:.1f}" for x in time_data]]) + if title != ("Total",): + js_stack[-1].append( + { + "name": title[-1], + "time": [x / 1000 for x in time_data], + } + ) +# TODO: Group repeated modules +with csv_file_path.open("w", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL) + for row in csv_data: + csv_writer.writerow(row) js_header_config = [{"name": problem["text"]} for problem in problem_set] loader = jinja2.FileSystemLoader(Path(__file__).parent) template = jinja2.Environment(loader=loader).get_template("template.html") diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html index 43f1d915a02..b076291322b 100644 --- a/examples/layer_wise_benchmarks/template.html +++ b/examples/layer_wise_benchmarks/template.html @@ -4,7 +4,11 @@ CUDA Kernel Latency Dashboard - + @@ -238,6 +269,13 @@ (Tip: Arrows to move, +/- to toggle, Click charts to select) +
[template.html: markup not preserved in this extract; this hunk adds a collapsible "🔧 Configuration (Click to expand)" panel that displays {{ runArgs }}]
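A note on the Overlap and Space rows introduced in parse.py above: they come from a single sweep over each run's kernel intervals in start order. The following is a minimal restatement of that accounting on made-up timestamps, a sketch rather than the patch's exact code.

```python
# Overlap is recorded as a negative quantity so that per-kernel time + Overlap + Space
# adds back up to the wall-clock span of the run (the "Total" row).
intervals = [(0, 10), (8, 14), (20, 25)]  # (start, end) per kernel, sorted by start
overlap = 0
space = 0
last_end = intervals[0][0]
for start, end in intervals:
    if start > last_end:
        space += start - last_end              # idle gap between consecutive kernels
    else:
        overlap += min(last_end, end) - start  # time already covered by earlier kernels
    last_end = max(last_end, end)
kernel_time = sum(end - start for start, end in intervals)
print(kernel_time, -overlap, space)   # 21 -2 6
print(kernel_time - overlap + space)  # 25, the span from 0 to 25
```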
From 08b9f596ad9cc50da3514051cec2f6b31d255cba Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 09:09:41 +0000 Subject: [PATCH 14/16] Fix according to coderabbitai's suggestions Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 2 +- examples/layer_wise_benchmarks/slurm_query_container_name.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 33d88e4f973..e9db0f24ac7 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -121,7 +121,7 @@ Command line arguments are comma separated, for example, `--batch-size 1,2,4`. C Run with OpenMPI: -``` +```bash NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size 1,2,4 --seq-len-q 1024,8192 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size 32,64,128,256,512 --seq-len-q 1,2,3,4 ``` diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh index 2445ceaa37c..a7f6b1ba815 100755 --- a/examples/layer_wise_benchmarks/slurm_query_container_name.sh +++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh @@ -13,9 +13,9 @@ fi if [ "$count" -gt 1 ]; then echo "Error: Multiple containers found" >&2 - for match in "$matches"; do + while IFS= read -r match; do echo "- ${match#$prefix}" >&2 - done + done <<< "$matches" exit 1 fi From f61103d052ed2bf80d5fe9465d646624fa23561d Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 09:58:37 +0000 Subject: [PATCH 15/16] Fix select_alltoall_method_type Signed-off-by: Tailing Yuan --- tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py index fd6e679ac86..8ef5266b9e4 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py @@ -247,7 +247,7 @@ def make_select_alltoall_method_type_2(select_alltoall_method_type_orig): def select_alltoall_method_type(self): # Replace the condition `mapping.moe_ep_size <= top_k` with `scaled_from <= top_k` # by replacing `top_k` with `fake_top_k` - top_k = self.routing_method.top_k + top_k = self.routing_method.experts_per_token if scaled_from <= top_k: fake_top_k = mapping.moe_ep_size + 1 else: From 7e143ac01b48b71914d9db5c0b04eb6e3b6f9982 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 10:03:56 +0000 Subject: [PATCH 16/16] Add more kernels to the parser Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/parse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index 00f08aff4f4..36248d8f114 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -259,6 +259,11 @@ def shortest_common_supersequence(a, b): ("Cumsum", "computeCumsumDevice"), ("moveIndice", "moveIndiceDevice"), ("moeAllToAll", "moeAllToAllKernel"), + ("moeA2APrepareDispatch", "moe_comm::moeA2APrepareDispatchKernel"), + ("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"), + ("moeA2ASanitizeExpertIds", "moe_comm::moeA2ASanitizeExpertIdsKernel"), + ("moeA2APrepareCombine", "moe_comm::moeA2APrepareCombineKernel"), + ("moeA2ACombine", "moe_comm::moeA2ACombineKernel"), 
("memsetExpertIds", "memsetExpertIdsDevice"), ("blockSum", "blockExpertPrefixSumKernel"), ("globalSum", "globalExpertPrefixSumKernel"),
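This last commit only extends the `parser_keywords` table. The lookup loop itself is not visible in this extract, so the substring match below is an assumption; it sketches how such a (label, keyword) table is typically consumed by `parse_kernel_name`, with the unknown-kernel handling shown earlier in the patch.

```python
# Hedged sketch: map a demangled kernel name to a short row label via keyword match.
parser_keywords = [
    ("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"),
    ("blockSum", "blockExpertPrefixSumKernel"),
    ("globalSum", "globalExpertPrefixSumKernel"),
]

def label_for(demangled_name: str, error_on_unknown: bool = False) -> str:
    for label, keyword in parser_keywords:
        if keyword in demangled_name:
            return label
    print(f"Unknown kernel name: {demangled_name}")
    if error_on_unknown:                # mirrors --error-on-unknown-kernel
        raise NotImplementedError(f"Unknown kernel name: {demangled_name}")
    return demangled_name[:30]          # parse.py falls back to a truncated name

print(label_for("void moe_comm::moeA2ADispatchKernel<float>(...)"))  # -> moeA2ADispatch
```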