From 6bd3d7e6b91b3edf4b1cdab1fbca040c548a0720 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 05:06:26 +0000 Subject: [PATCH 01/16] Update readme Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 20 +++++++++------- .../slurm_init_containers.sh | 3 ++- .../slurm_query_container_name.sh | 24 +++++++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) create mode 100755 examples/layer_wise_benchmarks/slurm_query_container_name.sh diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 6cb324cd126..da4f2f2d65f 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -28,12 +28,12 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSee NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 # Run with attention TP -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM # Run with MTP3 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4 @@ -49,17 +49,19 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-back NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp # Run Qwen3-Next (balanced routing is not implemented) -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified +NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified # Run with DeepEP A2A -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_ctx.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_gen.yaml --moe-backend WIDEEP ``` ### Run with Slurm -> Tips: If you have a running job 
with environment installed, please skip step 1 and 2 and go straight to step 3. In this case, your job must be run with `--container-name aaa`, and if the container name is not "layer_wise_benchmarks" please `export CONTAINER_NAME=aaa`.
+> Tips:
+> 1. If you already have a running Slurm job, skip step 1 and go straight to steps 2 and 3.
+> 2. Furthermore, if `tensorrt_llm` is already installed in the Slurm job, you can also skip step 2 and run step 3 with `export CONTAINER_NAME=aaa` set to your container's name. If you don't know the container name, run `export CONTAINER_NAME=$(SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_query_container_name.sh)` to get it.
 
 **Step 1:** On the controller node, allocate one or multiple nodes, and record the `SLURM_JOB_ID`:
 
@@ -77,6 +79,8 @@ SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh
 
 It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` for once.
 
+> Tips: If you want to change the image, there is no need to reallocate the Slurm job. Just start another container by running step 2 with `export CONTAINER_NAME=aaa` set; step 3 will then run in the container specified by the `CONTAINER_NAME` environment variable.
+
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:
 
 ```bash
diff --git a/examples/layer_wise_benchmarks/slurm_init_containers.sh b/examples/layer_wise_benchmarks/slurm_init_containers.sh
index 59d783b183a..08e77c5623a 100755
--- a/examples/layer_wise_benchmarks/slurm_init_containers.sh
+++ b/examples/layer_wise_benchmarks/slurm_init_containers.sh
@@ -3,6 +3,7 @@
 set -euo pipefail
 
 # CONTAINER_IMAGE=
+CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
 CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..")
 
 if [ "${SLURM_JOB_ID:-}" == "" ]; then
@@ -46,7 +47,7 @@ set -x
 srun -N "$NODES" \
     --ntasks-per-node 1 \
     --container-image "$CONTAINER_IMAGE" \
-    --container-name "layer_wise_benchmarks" \
+    --container-name "$CONTAINER_NAME" \
     --container-mounts "$CONTAINER_MOUNTS" \
     --container-workdir "$WORKDIR" \
     bash -c "pip install -U packaging &&
diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh
new file mode 100755
index 00000000000..2445ceaa37c
--- /dev/null
+++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+prefix="pyxis_${SLURM_JOB_ID}_"
+matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true)
+count=$(printf "%s" "$matches" | grep -c "^" || true)
+
+if [ "$count" -eq 0 ]; then
+    echo "Error: No container found" >&2
+    exit 1
+fi
+
+if [ "$count" -gt 1 ]; then
+    echo "Error: Multiple containers found" >&2
+    while IFS= read -r match; do
+        echo "- ${match#$prefix}" >&2
+    done <<< "$matches"
+    exit 1
+fi
+
+suffix=${matches#$prefix}
+echo "Container name: $suffix" >&2
+echo "$suffix"

From a2a840dd0683de4975589f472b25c5947efdbb1a Mon Sep 17 00:00:00 2001
From: Tailing Yuan
Date: Wed, 19 Nov 2025 07:01:20 +0000
Subject: [PATCH 02/16] Support list arguments

Signed-off-by: Tailing Yuan
---
 examples/layer_wise_benchmarks/run_single.py | 152 ++++++++++++-------
 1 file changed, 100 insertions(+), 52 deletions(-)

diff --git a/examples/layer_wise_benchmarks/run_single.py b/examples/layer_wise_benchmarks/run_single.py
index 77b95c35a6c..d48e9688091 100644
--- a/examples/layer_wise_benchmarks/run_single.py
+++ b/examples/layer_wise_benchmarks/run_single.py
@@ -1,4 +1,5 @@
 import argparse
+import 
itertools import numpy as np import nvtx @@ -15,6 +16,10 @@ def comma_separated_ints(s): return [int(x) for x in s.split(",")] +def comma_separated_floats(s): + return [float(x) for x in s.split(",")] + + # Parse cmdline parser = argparse.ArgumentParser() parser.add_argument("config_path", type=str) @@ -48,17 +53,37 @@ def comma_separated_ints(s): parser.add_argument("--seq-len-kv-cache", type=int) parser.add_argument("--balance-method", type=str) parser.add_argument("--balance-ratio", type=float) +# Batched run args +parser.add_argument("--batch-size-list", type=comma_separated_ints) +parser.add_argument("--seq-len-q-list", type=comma_separated_ints) +parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) +parser.add_argument("--balance-ratio-list", type=comma_separated_floats) args = parser.parse_args() +# Load YAML file with open(args.config_path) as f: config = yaml.safe_load(f) del args.config_path for k, v in vars(args).items(): if v is None and k in config: setattr(args, k, config[k]) +# Set list arguments +if args.batch_size_list is None: + args.batch_size_list = [args.batch_size] +del args.batch_size +if args.seq_len_q_list is None: + args.seq_len_q_list = [args.seq_len_q] +del args.seq_len_q +if args.seq_len_kv_cache_list is None: + args.seq_len_kv_cache_list = [args.seq_len_kv_cache] +del args.seq_len_kv_cache +if args.balance_ratio_list is None: + args.balance_ratio_list = [args.balance_ratio] +del args.balance_ratio +# Set default values if args.max_batch_size is None: - args.max_batch_size = args.batch_size + args.max_batch_size = max(args.batch_size_list) if args.max_num_tokens is None: - args.max_num_tokens = args.max_batch_size * args.seq_len_q + args.max_num_tokens = args.max_batch_size * max(args.seq_len_q_list) print(args) # MPI args @@ -98,60 +123,83 @@ def comma_separated_ints(s): ) # Warm up -assert args.batch_size <= args.max_batch_size -assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len -run_pack = runner.create_run_pack( - args.run_type, - batch_size=args.batch_size, - seq_len_q=args.seq_len_q, - seq_len_kv_cache=args.seq_len_kv_cache, - kv_cache_manager=kv_cache_manager, - attn_workspace=attn_workspace, -) -runner.replace_routing_method( - balance_method=BalanceMethod[args.balance_method], balance_ratio=args.balance_ratio -) -capture_stream.wait_stream(torch.cuda.current_stream()) -with torch.cuda.stream(capture_stream): - run_pack() - with autotune(): +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( + args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list +): + assert batch_size <= args.max_batch_size + assert seq_len_q + seq_len_kv_cache <= args.max_seq_len + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, + ) + runner.replace_routing_method( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ) + capture_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(capture_stream): run_pack() -torch.cuda.current_stream().wait_stream(capture_stream) + with autotune(): + run_pack() + torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() -# Profile: capture graph and replay it torch.cuda.cudart().cudaProfilerStart() -if args.use_cuda_graph: - with with_multi_stream(True): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, 
stream=capture_stream, capture_error_mode="global"): - run_pack() +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( + args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list +): + # Profile: capture graph and replay it + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, + ) + runner.replace_routing_method( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ) + if args.use_cuda_graph: + with with_multi_stream(True): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): + run_pack() -warmup_times = 20 -run_times = 100 -events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] -for i in range(warmup_times + run_times): - events[i].record() - with nvtx.annotate(f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"): - if args.use_cuda_graph: - g.replay() - else: - run_pack() -events[-1].record() -torch.cuda.synchronize() - -# Print statistics -# Print before `cudaProfilerStop` to ensure messages are included in the profile -time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] -time_list = time_list[warmup_times:] -print( - f"[RANK {rank}]" - f" min {np.min(time_list) * 1000:.1f}" - f" max {np.max(time_list) * 1000:.1f}" - f" mean {np.mean(time_list) * 1000:.1f}" - f" median {np.median(time_list) * 1000:.1f}" - f" P90 {np.percentile(time_list, 90) * 1000:.1f}" - f" (us)" -) + warmup_times = 20 + run_times = 100 + events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] + for i in range(warmup_times + run_times): + events[i].record() + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" + with nvtx.annotate( + f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + ): + if args.use_cuda_graph: + g.replay() + else: + run_pack() + events[-1].record() + torch.cuda.synchronize() + # Print statistics + # Print before `cudaProfilerStop` to ensure messages are included in the profile + time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] + time_list = time_list[warmup_times:] + print( + f"[RANK {rank}]" + f" batch_size {batch_size}" + f" seq_len_q {seq_len_q}" + f" seq_len_kv_cache {seq_len_kv_cache}" + + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2f}") + + f" min {np.min(time_list) * 1000:.1f}" + f" max {np.max(time_list) * 1000:.1f}" + f" mean {np.mean(time_list) * 1000:.1f}" + f" median {np.median(time_list) * 1000:.1f}" + f" P90 {np.percentile(time_list, 90) * 1000:.1f}" + f" (us)" + ) torch.cuda.cudart().cudaProfilerStop() From 8adb6d9657785af34cb101e7a50e83422ddae6e6 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 08:13:46 +0000 Subject: [PATCH 03/16] Add batched run Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 67 ++++++++++++------- .../{run_single.py => run.py} | 0 .../{run_single.sh => run.sh} | 11 +-- .../tools/test_layer_wise_benchmarks.py | 10 +-- 4 files changed, 54 insertions(+), 34 deletions(-) rename examples/layer_wise_benchmarks/{run_single.py => run.py} (100%) rename examples/layer_wise_benchmarks/{run_single.sh => run.sh} (74%) diff --git a/examples/layer_wise_benchmarks/README.md 
b/examples/layer_wise_benchmarks/README.md index da4f2f2d65f..aca475662e8 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -2,7 +2,7 @@ ## Generate profiles -### Run with MPI +### Run with OpenMPI **Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI. @@ -16,45 +16,45 @@ pip install -e ../.. ```bash # Run DeepSeek-R1 NVFP4 -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml # Run DeepSeek-V3.2-Exp -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM # Run DeepSeek-V3.2-Exp with 32k context length -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 # Run with attention TP -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM # Run with MTP3 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 # Run 4 layers -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8 # Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml 
--scaled-from 16 --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP
 
 # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
-NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
+NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
 
 # Run Qwen3-Next (balanced routing is not implemented)
-NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified
-NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified
+NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified
+NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified
 
 # Run with DeepEP A2A
-NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_ctx.yaml --moe-backend WIDEEP
-NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
+NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP
 ```
 
 ### Run with Slurm
@@ -85,17 +85,34 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The
 
 ```bash
 # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
 
 # Run with attention TP and TRTLLMGen
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
 
 # Run with DeepEPLowLatency
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
 
 # You can run 4-GPU and 8-GPU tasks without reallocate the slurm job
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
-SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
+SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_ctx.yaml
+```
+
+### Batched run
+
+By specifying `--batch-size-list` on the command line (or `batch_size_list` in the YAML file), the script runs multiple configurations in a single process. 
This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. When `--batch-size-list` is set, the value of `--batch-size` is ignored. + +Supported list arguments: +- `--batch-size-list` (or `batch_size_list` in YAML) +- `--seq-len-q-list` (or `seq_len_q_list` in YAML) +- `--seq-len-kv-cache-list` (or `seq_len_kv_cache_list` in YAML) +- `--balance-ratio-list` (or `balance_ratio_list` in YAML) + +Run with OpenMPI: + +``` +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size-list 1,2,4 --seq-len-q-list 1024,8192 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size-list 32,64,128,256,512 --seq-len-q-list 1,2,3,4 ``` ## Parse profiles diff --git a/examples/layer_wise_benchmarks/run_single.py b/examples/layer_wise_benchmarks/run.py similarity index 100% rename from examples/layer_wise_benchmarks/run_single.py rename to examples/layer_wise_benchmarks/run.py diff --git a/examples/layer_wise_benchmarks/run_single.sh b/examples/layer_wise_benchmarks/run.sh similarity index 74% rename from examples/layer_wise_benchmarks/run_single.sh rename to examples/layer_wise_benchmarks/run.sh index be9aa6e5a4d..7d26041e970 100755 --- a/examples/layer_wise_benchmarks/run_single.sh +++ b/examples/layer_wise_benchmarks/run.sh @@ -13,17 +13,18 @@ if [ "$RANK" -eq 0 ]; then export TLLM_LOG_LEVEL=INFO fi +PROFILE_DIR=${PROFILE_DIR:-profiles} +mkdir -p ${PROFILE_DIR} + PROFILE=${PROFILE:-1} GPU_METRICS=${GPU_METRICS:-0} if [ "$PROFILE" -eq 1 ]; then - PROFILE_FOLDER=profiles/run_single - mkdir -p ${PROFILE_FOLDER} PROFILE_CMD="nsys profile -t cuda,nvtx -s none --cpuctxsw none --cuda-event-trace false --cuda-graph-trace node -c cudaProfilerApi --capture-range-end stop - -o ${PROFILE_FOLDER}/run_single_ep${WORLD_SIZE}_rank${RANK}.nsys-rep + -o ${PROFILE_DIR}/report_ep${WORLD_SIZE}_rank${RANK}.nsys-rep --force-overwrite true" if [ "$GPU_METRICS" -eq 1 ]; then PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK @@ -34,4 +35,6 @@ else fi set -x -$PROFILE_CMD python3 -u run_single.py "$@" +$PROFILE_CMD bash -c \ + "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_ep${WORLD_SIZE}_rank${RANK}.log\"" \ + bash "$@" diff --git a/tests/unittest/tools/test_layer_wise_benchmarks.py b/tests/unittest/tools/test_layer_wise_benchmarks.py index 14a02a9ae07..bedd1cca795 100644 --- a/tests/unittest/tools/test_layer_wise_benchmarks.py +++ b/tests/unittest/tools/test_layer_wise_benchmarks.py @@ -14,7 +14,7 @@ def test_deepseek_r1_ctx_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -35,7 +35,7 @@ def test_deepseek_r1_ctx_tep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -59,7 +59,7 @@ def test_deepseek_v32_ctx_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_ctx.yaml", "--model", model_root / "DeepSeek-V3.2-Exp-hf", @@ -82,7 +82,7 @@ def test_deepseek_r1_gen_scaled_from_16_dep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_gen.yaml", "--model", model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", @@ -106,7 +106,7 @@ def test_qwen3_next_gen_tep(llm_root, world_size): check_call( [ "./mpi_launch.sh", - "./run_single.sh", + "./run.sh", "config_gen.yaml", 
"--model", model_root / "Qwen3" / "Qwen3-Next-80B-A3B-Instruct", From a6a8c5627182165502b8ed8ce8c418fc5328e343 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 19 Nov 2025 10:14:03 +0000 Subject: [PATCH 04/16] Move events outside Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/run.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index d48e9688091..6dc2a0df354 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -58,6 +58,9 @@ def comma_separated_floats(s): parser.add_argument("--seq-len-q-list", type=comma_separated_ints) parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) parser.add_argument("--balance-ratio-list", type=comma_separated_floats) +# Schedule +parser.add_argument("--warmup-times", type=int, default=20) +parser.add_argument("--run-times", type=int, default=100) args = parser.parse_args() # Load YAML file with open(args.config_path) as f: @@ -147,6 +150,11 @@ def comma_separated_floats(s): torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() +events = [ + torch.cuda.Event(enable_timing=True) for _ in range(args.warmup_times + args.run_times + 1) +] +[e.record() for e in events] # Explicitly warmup events because torch is lazy + torch.cuda.cudart().cudaProfilerStart() for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list @@ -169,15 +177,13 @@ def comma_separated_floats(s): with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): run_pack() - warmup_times = 20 - run_times = 100 - events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] - for i in range(warmup_times + run_times): + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" + nvtx_message = ( + f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + ) + for i in range(args.warmup_times + args.run_times): events[i].record() - balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" - with nvtx.annotate( - f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" - ): + with nvtx.annotate(nvtx_message): if args.use_cuda_graph: g.replay() else: @@ -188,7 +194,7 @@ def comma_separated_floats(s): # Print statistics # Print before `cudaProfilerStop` to ensure messages are included in the profile time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] - time_list = time_list[warmup_times:] + time_list = time_list[args.warmup_times :] print( f"[RANK {rank}]" f" batch_size {batch_size}" From 042f3c05f0080493a61db701209e4de789b10ac7 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 20 Nov 2025 04:15:33 +0000 Subject: [PATCH 05/16] Fix autotune Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/run.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index 6dc2a0df354..41a1c396cd4 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -126,9 +126,22 @@ def comma_separated_floats(s): ) # Warm up -for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( - 
args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list -): +for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ + [ + True, + max(args.batch_size_list), + max(args.seq_len_q_list), + args.seq_len_kv_cache_list[0], + args.balance_ratio_list[0], + ], + *itertools.product( + [False], + args.batch_size_list, + args.seq_len_q_list, + args.seq_len_kv_cache_list, + args.balance_ratio_list, + ), +]: assert batch_size <= args.max_batch_size assert seq_len_q + seq_len_kv_cache <= args.max_seq_len run_pack = runner.create_run_pack( @@ -144,9 +157,10 @@ def comma_separated_floats(s): ) capture_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(capture_stream): + if autotune_flag: + with autotune(): + run_pack() run_pack() - with autotune(): - run_pack() torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() From 991e7054f86f487094cb6bb68fbf90302dfa278a Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Thu, 20 Nov 2025 08:38:43 +0000 Subject: [PATCH 06/16] Fix balanced routing method Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 21 +- examples/layer_wise_benchmarks/run.py | 64 +++--- .../deepseekv3_runner.py | 135 +----------- .../layer_wise_benchmarks/runner_interface.py | 2 +- .../layer_wise_benchmarks/runner_utils.py | 202 +++++++++++++++++- .../tools/test_layer_wise_benchmarks.py | 1 + 6 files changed, 250 insertions(+), 175 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index aca475662e8..19f634a73f7 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -32,8 +32,8 @@ NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified # Run with MTP3 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 @@ -49,12 +49,21 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp # Run Qwen3-Next (balanced routing is not implemented) -NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 4 --balance-method NotModified -NP=2 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --batch-size 512 --balance-method NotModified +NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4 +NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512 # Run with DeepEP A2A NP=4 
./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP + +# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank +# Note: if balance ratio is 0, ignore activating all experts +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 + +# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 ``` ### Run with Slurm @@ -87,8 +96,8 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP -# Run with attention TP and TRTLLMGen -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +# Run with TRTLLMGen +SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM # Run with DeepEPLowLatency SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index 41a1c396cd4..a1d51877047 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -152,16 +152,16 @@ def comma_separated_floats(s): kv_cache_manager=kv_cache_manager, attn_workspace=attn_workspace, ) - runner.replace_routing_method( + with runner.replace_routing_method_ctx( balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio - ) - capture_stream.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(capture_stream): - if autotune_flag: - with autotune(): - run_pack() - run_pack() - torch.cuda.current_stream().wait_stream(capture_stream) + ): + capture_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(capture_stream): + if autotune_flag: + with autotune(): + run_pack() + run_pack() + torch.cuda.current_stream().wait_stream(capture_stream) torch.cuda.synchronize() events = [ @@ -182,27 +182,25 @@ def comma_separated_floats(s): kv_cache_manager=kv_cache_manager, attn_workspace=attn_workspace, ) - runner.replace_routing_method( + with runner.replace_routing_method_ctx( balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio - ) - if args.use_cuda_graph: - with with_multi_stream(True): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): - run_pack() - - balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2f}" - nvtx_message = ( - f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" - ) - for i in range(args.warmup_times + args.run_times): - events[i].record() - with nvtx.annotate(nvtx_message): - if args.use_cuda_graph: - g.replay() - else: - 
run_pack() - events[-1].record() + ): + if args.use_cuda_graph: + with with_multi_stream(True): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): + run_pack() + + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" + nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + for i in range(args.warmup_times + args.run_times): + events[i].record() + with nvtx.annotate(nvtx_message): + if args.use_cuda_graph: + g.replay() + else: + run_pack() + events[-1].record() torch.cuda.synchronize() # Print statistics @@ -214,11 +212,11 @@ def comma_separated_floats(s): f" batch_size {batch_size}" f" seq_len_q {seq_len_q}" f" seq_len_kv_cache {seq_len_kv_cache}" - + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2f}") - + f" min {np.min(time_list) * 1000:.1f}" - f" max {np.max(time_list) * 1000:.1f}" - f" mean {np.mean(time_list) * 1000:.1f}" + + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2g}") + + f" mean {np.mean(time_list) * 1000:.1f}" f" median {np.median(time_list) * 1000:.1f}" + f" min {np.min(time_list) * 1000:.1f}" + f" max {np.max(time_list) * 1000:.1f}" f" P90 {np.percentile(time_list, 90) * 1000:.1f}" f" (us)" ) diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py index 307e591dd27..b2445b18b22 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py @@ -1,124 +1,16 @@ -import functools from typing import List, Optional import torch -import tensorrt_llm._torch.models.modeling_deepseekv3 from tensorrt_llm._torch.model_config import ModelConfig -from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer, DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm._torch.utils import AuxStreamType -from tensorrt_llm._utils import mpi_rank, mpi_world_size from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping -from .runner_interface import BalanceMethod, RunnerBase -from .runner_utils import RunnerMixin, ceil_div - - -class RoutingMethod(DeepseekV3Gate): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.world_size = mpi_world_size() - self.rank = mpi_rank() - self.balance_method = BalanceMethod.NotModified - self.balance_ratio = None - - def apply(self, router_logits) -> (torch.Tensor, torch.Tensor): - token_selected_experts, token_final_scales = super().apply(router_logits) - num_experts = self.weight.shape[0] - if self.balance_method == BalanceMethod.NotModified: - pass - elif self.balance_method == BalanceMethod.Balanced: - token_selected_experts = RoutingMethod.get_balanced_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedRanks: - token_selected_experts = RoutingMethod.get_all_to_one_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedExperts: - token_selected_experts = 
RoutingMethod.get_balanced_rank_imbalanced_expert_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - else: - raise NotImplementedError(f"Not support balance_method {self.balance_method}") - return token_selected_experts, token_final_scales - - @staticmethod - @functools.cache - def get_balanced_selection(num_tokens, top_k, num_experts, dtype, world_size, rank): - a = torch.arange(num_tokens * world_size * top_k, dtype=dtype, device="cuda").view( - num_tokens, world_size, top_k - )[:, rank] - experts = ( - a * (num_experts // world_size + 1) + a // num_experts * (num_experts // world_size) - ) % num_experts - return experts.contiguous() - - @staticmethod - def apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): - num_tokens, top_k = imbalanced_experts.shape - dtype = imbalanced_experts.dtype - balanced_experts = RoutingMethod.get_balanced_selection( - num_tokens, top_k, num_experts, dtype, world_size, rank - ) - num_balanced_tokens = round(num_tokens * balance_ratio) - if balance_ratio != 0: - # Activate all experts - num_balanced_tokens = max( - num_balanced_tokens, ceil_div(num_experts, world_size * top_k) - ) - mixed_experts = balanced_experts.clone() - mixed_experts[num_balanced_tokens:] = imbalanced_experts[num_balanced_tokens:] - return mixed_experts - - @staticmethod - @functools.cache - def get_all_to_one_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - assert num_experts // world_size >= top_k - imbalanced_experts = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view( - num_tokens, top_k - ) % (num_experts // world_size) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) - - @staticmethod - @functools.cache - def get_balanced_rank_imbalanced_expert_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - experts_per_rank = num_experts // world_size - activate_experts_per_rank = ceil_div(top_k, world_size) - a = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view(num_tokens, top_k) - narrow_experts = a % (activate_experts_per_rank * world_size) - imbalanced_experts = ( - narrow_experts * experts_per_rank % num_experts - + narrow_experts // world_size % experts_per_rank - ) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) +from .runner_interface import RunnerBase +from .runner_utils import RunnerMixin class DeepSeekV3Runner(RunnerMixin, RunnerBase): @@ -139,10 +31,6 @@ def __init__( moe_max_num_tokens: int, use_cuda_graph: bool, ): - # Temporally replace the gate class - gate_cls_orig = tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = RoutingMethod - self.model_config = ModelConfig.from_pretrained( pretrained_model_name_or_path, mapping=mapping, @@ -206,20 +94,3 @@ def __init__( layers[-1].next_layer_layernorm = next_layer_layernorm self.layers = layers - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = gate_cls_orig - - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): - if self.model_config.moe_backend not in [ - "CUTLASS", - "DEEPGEMM", - "TRTLLM", - "WIDEEP", - "CUTEDSL", - ]: - raise NotImplementedError( - f'Not support replace routing method for 
moe_backend "{self.model_config.moe_backend}",' - f' please set balance_method to "NotModified"' - ) - for layer in self.layers: - layer.mlp.gate.balance_method = balance_method - layer.mlp.gate.balance_ratio = balance_ratio diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py index 20d672da167..9451124e20c 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py @@ -28,7 +28,7 @@ def create_run_pack( pass @abstractmethod - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): + def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float): pass @staticmethod diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py index ea7201e7f2b..accdce26c1e 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py @@ -1,4 +1,5 @@ import contextlib +import functools import os import weakref from abc import ABC, abstractmethod @@ -33,6 +34,166 @@ def round_up(a, b): return ceil_div(a, b) * b +def get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank +): + # First, each sender selects target rank + target_rank_before_mod = torch.arange(num_tokens * world_size * top_k).view( + num_tokens, world_size, top_k + ) + target_rank_before_mod += top_k * torch.arange(num_tokens).view( + num_tokens, 1, 1 + ) # Shift `top_k` ranks for the next token on each rank, to balance network traffic + target_rank = target_rank_before_mod % world_size + # Second, each receiver selects target expert + target_expert = torch.empty_like(target_rank) + for reciever_rank in range(world_size): + mask = target_rank == reciever_rank + experts_per_rank = num_experts // world_size + local_expert = torch.arange(num_tokens * top_k) % experts_per_rank + target_expert[mask] = (reciever_rank * experts_per_rank) + local_expert + token_selected_experts = target_expert[:, rank].sort(dim=-1).values + return token_selected_experts.contiguous().to(dtype=dtype, device=device) + + +get_balanced_selection = functools.cache(get_balanced_selection_no_cache) + + +def test_get_balanced_selection(): + dtype = torch.long + for num_tokens in range(1, 33): + for num_experts in range(1, 65): + print(f"{num_tokens=} {num_experts=}") + for top_k in range(1, min(11, num_experts)): + for world_size in range(1, 65): + if num_experts % world_size == 0: + tokens_per_expert = torch.zeros(num_experts) + for rank in range(world_size): + token_selected_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, "cpu", world_size, rank + ) + sorted_selection = token_selected_experts.sort(dim=-1).values + if (sorted_selection[:, :-1] == sorted_selection[:, 1:]).any(): + raise ValueError(f"duplicated experts on rank {rank}") + experts_per_rank = num_experts // world_size + tokens_per_rank = ( + (token_selected_experts // experts_per_rank) + .view(-1) + .bincount(minlength=world_size) + ) + if tokens_per_rank.max() - tokens_per_rank.min() > 1: + raise ValueError(f"tokens sent from rank {rank} is not balanced") + tokens_per_expert += token_selected_experts.view(-1).bincount( + minlength=num_experts + ) + if tokens_per_expert.max() - tokens_per_expert.min() > 1: + raise ValueError("tokens per expert is not balanced") + + +def 
apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): + num_tokens, top_k = imbalanced_experts.shape + dtype = imbalanced_experts.dtype + device = imbalanced_experts.device + balanced_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank + ) + if balance_ratio == 0.0: + num_balanced_tokens = 0 + else: + # Activate all experts + min_num_balanced_tokens = min(num_tokens, ceil_div(num_experts, world_size * top_k)) + num_balanced_tokens = min_num_balanced_tokens + round( + (num_tokens - min_num_balanced_tokens) * balance_ratio + ) + mixed_experts = torch.cat( + [balanced_experts[:num_balanced_tokens], imbalanced_experts[num_balanced_tokens:]] + ) + return mixed_experts + + +@functools.cache +def get_all_to_one_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + if top_k > experts_per_rank: + raise ValueError( + "Cannot send all tokens to a single rank because `top_k > experts_per_rank`" + ) + imbalanced_experts = ( + torch.arange( + rank * num_tokens * top_k, (rank + 1) * num_tokens * top_k, dtype=dtype, device=device + ).view(num_tokens, top_k) + % experts_per_rank + ) + imbalanced_experts = imbalanced_experts.sort(dim=-1).values + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +@functools.cache +def get_balanced_rank_imbalanced_expert_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + active_experts_per_rank = ceil_div(top_k, world_size) + # Select expert from [0, active_experts_per_rank * world_size), + # then scale to [0, experts_per_rank * world_size) + narrow_experts = get_balanced_selection_no_cache( + num_tokens, top_k, active_experts_per_rank * world_size, dtype, device, world_size, rank + ) + imbalanced_experts = ( + narrow_experts // active_experts_per_rank * experts_per_rank + + narrow_experts % active_experts_per_rank + ) + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +def make_balanced_routing_method( + apply_method_orig, num_experts, balance_method, balance_ratio, world_size, rank +): + def balanced_routing_method(router_logits): + token_selected_experts, token_final_scales = apply_method_orig(router_logits) + if balance_method == BalanceMethod.NotModified: + pass + elif balance_method == BalanceMethod.Balanced: + token_selected_experts = get_balanced_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedRanks: + token_selected_experts = get_all_to_one_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedExperts: + token_selected_experts = get_balanced_rank_imbalanced_expert_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + else: + raise NotImplementedError(f"Not support balance_method {balance_method}") + return token_selected_experts, token_final_scales + + return 
balanced_routing_method + + class RunnerMixin(ABC): @staticmethod @abstractmethod @@ -186,9 +347,44 @@ def run_pack(): return run_pack - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): - if balance_method != BalanceMethod.NotModified: - raise NotImplementedError("not support replacing routing method for this runner") + @contextlib.contextmanager + def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float): + if balance_method == BalanceMethod.NotModified: + pass + elif self.model_config.moe_backend not in [ + "CUTEDSL", + "CUTLASS", + "DEEPGEMM", + "TRTLLM", + "WIDEEP", + ]: + raise NotImplementedError( + f'Not support replace routing method for moe_backend "{self.model_config.moe_backend}",' + f' please set balance_method to "NotModified"' + ) + elif ( + self.model_config.moe_backend == "TRTLLM" + and not self.model_config.mapping.enable_attention_dp + ): + raise NotImplementedError( + 'Not support replace routing method for moe_backend "TRTLLM" with attention TP,' + ' please set balance_method to "NotModified"' + ) + apply_methods_orig = [layer.mlp.experts.routing_method.apply for layer in self.layers] + try: + for layer, apply_method_orig in zip(self.layers, apply_methods_orig): + layer.mlp.experts.routing_method.apply = make_balanced_routing_method( + apply_method_orig, + layer.mlp.experts.num_experts, + balance_method, + balance_ratio, + layer.mlp.experts.ep_size, + layer.mlp.experts.ep_rank, + ) + yield + finally: + for layer, apply_method_orig in zip(self.layers, apply_methods_orig): + layer.mlp.experts.routing_method.apply = apply_method_orig @staticmethod def create_kv_cache_manager( diff --git a/tests/unittest/tools/test_layer_wise_benchmarks.py b/tests/unittest/tools/test_layer_wise_benchmarks.py index bedd1cca795..56d73c08952 100644 --- a/tests/unittest/tools/test_layer_wise_benchmarks.py +++ b/tests/unittest/tools/test_layer_wise_benchmarks.py @@ -41,6 +41,7 @@ def test_deepseek_r1_ctx_tep(llm_root, world_size): model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2", "--no-enable-attention-dp", "--moe-backend=TRTLLM", + "--balance-method=NotModified", ], cwd=llm_root / "examples" / "layer_wise_benchmarks", env={ From 23d004f2ba8bd9d65282c03b59b6c25be30c4222 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 21 Nov 2025 04:31:34 +0000 Subject: [PATCH 07/16] Update list option interface Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 16 ++++---- examples/layer_wise_benchmarks/run.py | 48 ++++++++++++------------ 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 19f634a73f7..d0ebcf081a8 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -109,19 +109,21 @@ SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gtx.ya ### Batched run -By specifying `--batch-size-list` on the command line (or `batch_size_list` in the YAML file), the script runs multiple configurations in a single process. This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. When `--batch-size-list` is set, the value of `--batch-size` is ignored. +By specifying a list for `--batch-size` on the command line (or `batch_size` in the YAML file), the script runs multiple configurations in a single process. 
This significantly reduces the total runtime because it avoids repeated library initialization and model initialization. Supported list arguments: -- `--batch-size-list` (or `batch_size_list` in YAML) -- `--seq-len-q-list` (or `seq_len_q_list` in YAML) -- `--seq-len-kv-cache-list` (or `seq_len_kv_cache_list` in YAML) -- `--balance-ratio-list` (or `balance_ratio_list` in YAML) +- `--batch-size` (or `batch_size` in YAML) +- `--seq-len-q` (or `seq_len_q` in YAML) +- `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML) +- `--balance-ratio` (or `balance_ratio` in YAML) + +Command line arguments are comma separated, for example, `--batch-size 1,2,4`. Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`. Run with OpenMPI: ``` -NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size-list 1,2,4 --seq-len-q-list 1024,8192 -NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size-list 32,64,128,256,512 --seq-len-q-list 1,2,3,4 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size 1,2,4 --seq-len-q 1024,8192 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size 32,64,128,256,512 --seq-len-q 1,2,3,4 ``` ## Parse profiles diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index a1d51877047..6d152fb9a75 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -48,16 +48,11 @@ def comma_separated_floats(s): group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph") parser.set_defaults(use_cuda_graph=None) # Per iteration args -parser.add_argument("--batch-size", type=int) -parser.add_argument("--seq-len-q", type=int) -parser.add_argument("--seq-len-kv-cache", type=int) +parser.add_argument("--batch-size", type=comma_separated_ints, dest="batch_size_list") +parser.add_argument("--seq-len-q", type=comma_separated_ints, dest="seq_len_q_list") +parser.add_argument("--seq-len-kv-cache", type=comma_separated_ints, dest="seq_len_kv_cache_list") parser.add_argument("--balance-method", type=str) -parser.add_argument("--balance-ratio", type=float) -# Batched run args -parser.add_argument("--batch-size-list", type=comma_separated_ints) -parser.add_argument("--seq-len-q-list", type=comma_separated_ints) -parser.add_argument("--seq-len-kv-cache-list", type=comma_separated_ints) -parser.add_argument("--balance-ratio-list", type=comma_separated_floats) +parser.add_argument("--balance-ratio", type=comma_separated_floats, dest="balance_ratio_list") # Schedule parser.add_argument("--warmup-times", type=int, default=20) parser.add_argument("--run-times", type=int, default=100) @@ -67,21 +62,26 @@ def comma_separated_floats(s): config = yaml.safe_load(f) del args.config_path for k, v in vars(args).items(): - if v is None and k in config: - setattr(args, k, config[k]) -# Set list arguments -if args.batch_size_list is None: - args.batch_size_list = [args.batch_size] -del args.batch_size -if args.seq_len_q_list is None: - args.seq_len_q_list = [args.seq_len_q] -del args.seq_len_q -if args.seq_len_kv_cache_list is None: - args.seq_len_kv_cache_list = [args.seq_len_kv_cache] -del args.seq_len_kv_cache -if args.balance_ratio_list is None: - args.balance_ratio_list = [args.balance_ratio] -del args.balance_ratio + if k.endswith("_list"): + config_key = k[: -len("_list")] + if v is None and config_key in config: + v = config[config_key] + if isinstance(v, list): + pass + elif v is None or isinstance(v, (int, 
float)): + v = [v] + else: + raise ValueError(f'Config "{config_key}" in YAML should be a value or a list') + setattr(args, k, v) + else: + config_key = k + if v is None and config_key in config: + v = config[config_key] + setattr(args, k, v) + if config_key in config: + del config[config_key] +if config: + raise ValueError(f"Config {','.join(config.keys())} from file are not options") # Set default values if args.max_batch_size is None: args.max_batch_size = max(args.batch_size_list) From 99fa5b0e19926f348c20e9bed329a7761989ab49 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Fri, 21 Nov 2025 13:22:24 +0000 Subject: [PATCH 08/16] Add a parser (WIP) Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/parse.py | 298 ++++++++++++++++++++++++ examples/layer_wise_benchmarks/run.py | 11 + examples/layer_wise_benchmarks/run.sh | 4 +- 3 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 examples/layer_wise_benchmarks/parse.py diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py new file mode 100644 index 00000000000..b00403c532c --- /dev/null +++ b/examples/layer_wise_benchmarks/parse.py @@ -0,0 +1,298 @@ +import argparse +import bisect +import json +import re +import sqlite3 +import subprocess +from pathlib import Path + +import numpy as np +import pandas as pd + +# Parse cmdline +parser = argparse.ArgumentParser() +parser.add_argument("--profile-dir", type=str, default="profiles") +parser.add_argument("--world-size", type=int) +parser.add_argument("--rank", type=int, default=0) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument("--unknown-kernel-as-error", action="store_true", dest="unknown_kernel_as_error") +group.add_argument( + "--no-unknown-kernel-as-error", action="store_false", dest="unknown_kernel_as_error" +) +args = parser.parse_args() + + +def lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path): + if ( + not sqlite_file_path.is_file() + or nsys_rep_file_path.stat().st_mtime > sqlite_file_path.stat().st_mtime + ): + subprocess.check_call( + [ + "nsys", + "export", + "--type", + "sqlite", + "-o", + sqlite_file_path, + "--force-overwrite=true", + nsys_rep_file_path, + ] + ) + + +def shortest_common_supersequence(a, b): + # Merge two lists into their shortest common supersequence, + # so that both `a` and `b` are subsequences of the result. + # Uses dynamic programming to compute the shortest common supersequence, then reconstructs it. 
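+    # For example, for a = ["A", "B", "C"] and b = ["B", "D"], one shortest common
+    # supersequence is ["A", "B", "C", "D"]; its length equals len(a) + len(b) minus
+    # the length of the longest common subsequence of a and b (here 3 + 2 - 1 = 4).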
+ m, n = len(a), len(b) + dp = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(m + 1): + dp[i][0] = i + for j in range(n + 1): + dp[0][j] = j + for i in range(1, m + 1): + for j in range(1, n + 1): + if a[i - 1] == b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1) + # Backtrack to build the merged sequence + res = [] + i, j = m, n + while i > 0 and j > 0: + if a[i - 1] == b[j - 1]: + res.append(a[i - 1]) + i -= 1 + j -= 1 + elif dp[i - 1][j] < dp[i][j - 1]: + res.append(a[i - 1]) + i -= 1 + else: + res.append(b[j - 1]) + j -= 1 + while i > 0: + res.append(a[i - 1]) + i -= 1 + while j > 0: + res.append(b[j - 1]) + j -= 1 + res.reverse() + return res + + +nsys_rep_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.sqlite" +lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) + +conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) + +query = "SELECT * FROM ENUM_NSYS_EVENT_TYPE" +df = pd.read_sql_query(query, conn) +event_id_NvtxDomainCreate = df[df["name"] == "NvtxDomainCreate"].iloc[0]["id"].tolist() +event_id_NvtxPushPopRange = df[df["name"] == "NvtxPushPopRange"].iloc[0]["id"].tolist() + +query = "SELECT domainId FROM NVTX_EVENTS WHERE eventType = ? AND text = ?" +df = pd.read_sql_query(query, conn, params=(event_id_NvtxDomainCreate, "NCCL")) +nccl_domain_id = -1 if df.empty else df.iloc[0]["domainId"].tolist() + +query = """SELECT T1.start, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value LIKE ?""" +df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %")) +problem_start = [] +problem_set = [] +for start, text in df.itertuples(index=False): + if text.startswith("layer_wise_benchmarks args {"): + run_args = json.loads(text[len("layer_wise_benchmarks args") :]) + elif text.startswith("layer_wise_benchmarks problem_spec {"): + problem_start.append(start) + problem_set.append( + { + "spec": json.loads(text[len("layer_wise_benchmarks problem_spec") :]), + "text": "", + "runs": [], + "runs_end": [], + "ranges": [], + } + ) + +query = """SELECT T1.start, T1.end, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value NOT LIKE ? 
AND domainId != ?""" +df = pd.read_sql_query( + query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", nccl_domain_id) +) +for start, end, text in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + if re.match(r"b=\d+ s=\d+ ", text): + problem_set[problem_id]["text"] = text + problem_set[problem_id]["runs"].append(start) + problem_set[problem_id]["runs_end"].append(end) + else: + problem_set[problem_id]["ranges"].append((start, end, text)) + +query = """SELECT unified.start, unified.end, unified.demangledName, + R.start AS runtime_start, R.end AS runtime_end, + CGE2.start AS capture_start, CGE2.end AS capture_end +FROM ( + SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId + FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1 + UNION ALL + SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2 + UNION ALL + SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3 +) AS unified +JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND + CGE1.originalGraphNodeId IS NOT NULL +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId""" +df = pd.read_sql_query(query, conn) +kernel_list = [] +for ( + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + run_id = bisect.bisect(problem_set[problem_id]["runs"], runtime_start) - 1 + if ( + run_id == -1 + or run_id == len(problem_set[problem_id]["runs"]) + or runtime_start >= problem_set[problem_id]["runs_end"][run_id] + ): + run_id = -1 + ranges = [ + text + for range_start, range_end, text in problem_set[problem_id]["ranges"] + if capture_start >= range_start and capture_end <= range_end + ] + kernel_list.append( + ( + problem_id, + run_id, + ranges, + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, + ) + ) + +query = "SELECT * FROM StringIds" +df = pd.read_sql_query(query, conn) +string_ids = dict(zip(df["id"], df["value"])) + +conn.close() + +kernel_list.sort(key=lambda t: (t[6], t[8])) +kernels = [[[] for _ in problem["runs"]] for problem in problem_set] +for ( + problem_id, + run_id, + ranges, + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in kernel_list: + if run_id != -1: + kernels[problem_id][run_id].append((demangledName, start, end, ranges)) +for problem_id in range(len(kernels)): + required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]] + for run_id in range(len(kernels[problem_id])): + seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]] + assert seq == required_seq + + +parser_keywords = [ + ("Gemm", "nvjet"), + ("RMSNorm", "RMSNormKernel"), + ("Cat", "CatArrayBatchedCopy"), + ("RoPE", "applyMLARope"), + ("fmha", "fmhaSm100fKernel_Qkv"), + ("fmhaReduction", "fmhaReductionKernel"), + ("quant", "quantize_with_block_size"), + ("expandInput", "expandInputRowsKernel"), + ("computeStrides", "computeStridesTmaWarpSpecializedKernel"), + ("GroupGemm", "cutlass::device_kernel&1 | tee \"$PROFILE_DIR/report_ep${WORLD_SIZE}_rank${RANK}.log\"" \ + "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_np${WORLD_SIZE}_rank${RANK}.log\"" 
\ bash "$@" From 6a9ce27cee612719011b712eaac22df540fa923c Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Mon, 24 Nov 2025 13:32:58 +0000 Subject: [PATCH 09/16] Add HTML, support more models Signed-off-by: Tailing Yuan --- .gitignore | 1 + examples/layer_wise_benchmarks/parse.py | 173 ++++- examples/layer_wise_benchmarks/run.py | 5 +- examples/layer_wise_benchmarks/template.html | 714 ++++++++++++++++++ .../tools/layer_wise_benchmarks/__init__.py | 6 +- .../tools/layer_wise_benchmarks/mark_utils.py | 24 + 6 files changed, 879 insertions(+), 44 deletions(-) create mode 100644 examples/layer_wise_benchmarks/template.html create mode 100644 tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py diff --git a/.gitignore b/.gitignore index e2e8a6e58ef..78d8da20e47 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ cpp/include/tensorrt_llm/executor/version.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h .devcontainer/.env +/examples/layer_wise_benchmarks/profiles/ # User config files CMakeUserPresets.json diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index b00403c532c..e979c6105e6 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -6,20 +6,23 @@ import subprocess from pathlib import Path +import jinja2 import numpy as np import pandas as pd # Parse cmdline parser = argparse.ArgumentParser() parser.add_argument("--profile-dir", type=str, default="profiles") -parser.add_argument("--world-size", type=int) +parser.add_argument("--world-size", "--np", type=int) parser.add_argument("--rank", type=int, default=0) group = parser.add_mutually_exclusive_group(required=False) -group.add_argument("--unknown-kernel-as-error", action="store_true", dest="unknown_kernel_as_error") +group.add_argument("--error-on-unknown-kernel", action="store_true", dest="error_on_unknown_kernel") group.add_argument( - "--no-unknown-kernel-as-error", action="store_false", dest="unknown_kernel_as_error" + "--no-error-on-unknown-kernel", action="store_false", dest="error_on_unknown_kernel" ) +parser.set_defaults(error_on_unknown_kernel=None) args = parser.parse_args() +print(args) def lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path): @@ -81,8 +84,10 @@ def shortest_common_supersequence(a, b): return res -nsys_rep_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" -sqlite_file_path = Path(args.profile_dir) / f"report_np{args.world_size}_rank{args.rank}.sqlite" +profile_dir = Path(args.profile_dir) +nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" +html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) @@ -121,12 +126,15 @@ def shortest_common_supersequence(a, b): query = """SELECT T1.start, T1.end, T2.value AS text FROM NVTX_EVENTS AS T1 JOIN StringIds AS T2 ON T1.textId = T2.id - WHERE eventType = ? AND T2.value NOT LIKE ? AND domainId != ?""" + WHERE eventType = ? AND T2.value NOT LIKE ? AND T2.value NOT LIKE ? 
AND domainId != ?""" df = pd.read_sql_query( - query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", nccl_domain_id) + query, + conn, + params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", "[DG]%", nccl_domain_id), ) for start, end, text in df.itertuples(index=False): problem_id = bisect.bisect(problem_start, start) - 1 + assert problem_id != -1 if re.match(r"b=\d+ s=\d+ ", text): problem_set[problem_id]["text"] = text problem_set[problem_id]["runs"].append(start) @@ -134,19 +142,23 @@ def shortest_common_supersequence(a, b): else: problem_set[problem_id]["ranges"].append((start, end, text)) -query = """SELECT unified.start, unified.end, unified.demangledName, +query = """SELECT name FROM sqlite_master WHERE type = ?""" +df = pd.read_sql_query(query, conn, params=("table",)) +tables = df["name"].tolist() +unified_subquery = """SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId + FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1""" +if "CUPTI_ACTIVITY_KIND_MEMCPY" in tables: + unified_subquery += """ UNION ALL + SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2""" +if "CUPTI_ACTIVITY_KIND_MEMSET" in tables: + unified_subquery += """ UNION ALL + SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3""" +query = f"""SELECT unified.start, unified.end, unified.demangledName, R.start AS runtime_start, R.end AS runtime_end, CGE2.start AS capture_start, CGE2.end AS capture_end -FROM ( - SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId - FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1 - UNION ALL - SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId - FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2 - UNION ALL - SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId - FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3 -) AS unified +FROM ({unified_subquery}) AS unified JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND CGE1.originalGraphNodeId IS NOT NULL @@ -189,6 +201,7 @@ def shortest_common_supersequence(a, b): capture_end, ) ) +# TODO: Parse CTX phases query = "SELECT * FROM StringIds" df = pd.read_sql_query(query, conn) @@ -220,22 +233,25 @@ def shortest_common_supersequence(a, b): parser_keywords = [ - ("Gemm", "nvjet"), + ("cuBLASGemm", "nvjet"), + ("splitKreduce", "splitKreduce_kernel"), + ("fusedAGemm", "fused_a_gemm_kernel"), ("RMSNorm", "RMSNormKernel"), - ("Cat", "CatArrayBatchedCopy"), - ("RoPE", "applyMLARope"), - ("fmha", "fmhaSm100fKernel_Qkv"), + ("torchCat", "CatArrayBatchedCopy"), + ("applyMLARope", "applyMLARope"), + ("fmhaSm100f", "fmhaSm100fKernel_Qkv"), ("fmhaReduction", "fmhaReductionKernel"), ("quant", "quantize_with_block_size"), + ("AllGather", "ncclDevKernel_AllGather_"), + ("ReduceScatter", "ncclDevKernel_ReduceScatter_"), + ("allreduce_oneshot", "allreduce_fusion_kernel_oneshot_lamport"), + ("allreduce_twoshot", "allreduce_fusion_kernel_twoshot_sync"), ("expandInput", "expandInputRowsKernel"), ("computeStrides", "computeStridesTmaWarpSpecializedKernel"), - ("GroupGemm", "cutlass::device_kernel", "at::native::CUDAFunctorOnSelf_add"), + ("convert_req_index", "_convert_req_index_to_global_index_kernel_with_stride_factor"), + ("preprocess_after_permute", "_preprocess_after_permute_kernel"), + ("masked_index_copy_quant", 
"_masked_index_copy_group_quant_fp8"), + ("swiglu_quant", "_silu_and_mul_post_quant_kernel"), + ("masked_index_gather", "masked_index_gather_kernel"), + ("finalizeMoeRouting", "tensorrt_llm::kernels::cutlass_kernels::finalizeMoeRoutingKernel<"), + ("fused_qkvzba_split", "fused_qkvzba_split_reshape_cat_kernel"), + ("causal_conv1d_update", "tensorrt_llm::kernels::causal_conv1d::causal_conv1d_update_kernel<"), + ("fused_delta_rule_update", "fused_sigmoid_gating_delta_rule_update_kernel"), + ("layer_norm_fwd_1pass", "_layer_norm_fwd_1pass_kernel"), + ("torchGatherTopK", "at::native::sbtopk::gatherTopK<"), + ("softmax_warp_forward", "softmax_warp_forward<"), + ("torchSigmoid", "at::native::sigmoid_kernel_cuda"), + ("torchMul", "at::native::binary_internal::MulFunctor<"), + ("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"), ] warned_names = set() @@ -265,7 +317,7 @@ def parse_kernel_name(demangledName): if name not in warned_names: print(f"Unknown kernel name: {name}") warned_names.add(name) - if args.unknown_kernel_as_error: + if args.error_on_unknown_kernel: raise NotImplementedError(f"Unknown kernel name: {name}") return name[:20] @@ -274,21 +326,26 @@ def parse_kernel_name(demangledName): for runs in kernels: converted_seq = [] for i, (demangledName, _, _, ranges) in enumerate(runs[0]): - # TODO: Group by ranges name = parse_kernel_name(demangledName) - if ranges: - name = f"{ranges[0]}.{name}" + category = (*ranges, name) time_list = [run[i][2] - run[i][1] for run in runs] time_list = time_list[run_args["warmup_times"] :] - time = np.mean(time_list).tolist() - converted_seq.append((name, time)) + t = np.mean(time_list).tolist() + converted_seq.append((category, t)) converted_seqs.append(converted_seq) merged_title = [] for converted_seq in converted_seqs: title = [name for name, _ in converted_seq] merged_title = shortest_common_supersequence(merged_title, title) -print(merged_title) + +merged_data = [[0.0] * len(problem_set) for _ in merged_title] +for problem_id, converted_seq in enumerate(converted_seqs): + cur = 0 + for category, t in converted_seq: + cur = merged_title.index(category, cur) + merged_data[cur][problem_id] = t + cur += 1 print("Problem set:") for problem in problem_set: @@ -296,3 +353,43 @@ def parse_kernel_name(demangledName): f'- "{problem["text"]}" {len(problem["runs"])} runs' f" Ranges: [{', '.join(text for _, _, text in problem['ranges'])}]" ) + +stack = [] +js_data = [] +js_stack = [js_data] +max_title_len = max((len(title) - 1) * 3 + len(title[-1]) for title in merged_title) +for title, time_data in zip(merged_title, merged_data): + while stack != list(title[: len(stack)]): + level_title = stack[-1] + stack.pop() + js_stack[-2].append( + { + "name": level_title, + "children": js_stack[-1], + } + ) + js_stack.pop() + while len(stack) != len(title) - 1: + level_title = title[len(stack)] + stack.append(level_title) + level = len(stack) + print("|--" * (level - 1) + level_title) + js_stack.append([]) + level = len(stack) + 1 + print( + "|--" * (level - 1) + title[-1] + " " * (max_title_len - (level - 1) * 3 - len(title[-1])), + *[f"{x / 1000:-6.2f}" for x in time_data], + ) + js_stack[-1].append( + { + "name": title[-1], + "time": [x / 1000 for x in time_data], + } + ) +# TODO: Print overlap and space +# TODO: Statistics +js_header_config = [{"name": problem["text"]} for problem in problem_set] +loader = jinja2.FileSystemLoader(Path(__file__).parent) +template = jinja2.Environment(loader=loader).get_template("template.html") +with 
html_file_path.open("w") as f: + f.write(template.render(headerConfig=js_header_config, rawData=js_data)) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index b2bcf1d70ae..6a074cb67d9 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -10,7 +10,7 @@ from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size -from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls +from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges def comma_separated_ints(s): @@ -97,6 +97,7 @@ def comma_separated_floats(s): torch.cuda.set_device(local_rank) # Create KV cache manager +mark_ranges() Runner = get_runner_cls(args.model) mapping = Runner.create_mapping(enable_attention_dp=args.enable_attention_dp) kv_cache_manager = Runner.create_kv_cache_manager( @@ -203,7 +204,7 @@ def comma_separated_floats(s): run_pack() balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" - nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} EP{world_size}" + nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} NP{world_size}" for i in range(args.warmup_times + args.run_times): events[i].record() with nvtx.annotate(nvtx_message): diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html new file mode 100644 index 00000000000..43f1d915a02 --- /dev/null +++ b/examples/layer_wise_benchmarks/template.html @@ -0,0 +1,714 @@ + + + + + + CUDA Kernel Latency Dashboard + + + + + +
[template.html: 714 lines of dashboard markup are not preserved in this extract; the file adds a Controls toolbar (arrow keys to move, +/- to toggle, click charts to select), a "Row Analysis" panel, and a "Column Hierarchy" panel]
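A standalone illustration (not part of the patch; the kernel labels below are made up) of how parse.py aligns per-problem kernel sequences into shared report rows: merge the sequences with a shortest common supersequence, then index each problem's timings into the merged order. The DP mirrors shortest_common_supersequence() introduced above, trimmed for brevity.

```python
def scs(a, b):
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1]) + 1
    res, i, j = [], m, n
    while i > 0 and j > 0:  # backtrack to reconstruct the merged sequence
        if a[i - 1] == b[j - 1]:
            res.append(a[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] < dp[i][j - 1]:
            res.append(a[i - 1])
            i -= 1
        else:
            res.append(b[j - 1])
            j -= 1
    res.extend(a[:i][::-1])
    res.extend(b[:j][::-1])
    return res[::-1]

seq_a = ["RMSNorm", "cuBLASGemm", "fmhaSm100f"]  # kernels recorded for problem A
seq_b = ["RMSNorm", "fmhaSm100f", "quant"]       # kernels recorded for problem B
print(scs(seq_a, seq_b))
# ['RMSNorm', 'cuBLASGemm', 'fmhaSm100f', 'quant']
# Both inputs remain subsequences of the result, so parse.py can place each problem's
# times with merged_title.index(category, cur) and leave rows it never ran at 0.0.
```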
+ + + + diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py index 607e110b62e..e347df3ca8e 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py @@ -1,7 +1,5 @@ +from .mark_utils import mark_ranges from .runner_factory import get_runner_cls from .runner_interface import BalanceMethod -__all__ = [ - "BalanceMethod", - "get_runner_cls", -] +__all__ = ["BalanceMethod", "get_runner_cls", "mark_ranges"] diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py new file mode 100644 index 00000000000..72625d10598 --- /dev/null +++ b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py @@ -0,0 +1,24 @@ +import nvtx + +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_qwen3_next import ( + Qwen3NextGatedDeltaNet, + Qwen3NextSparseMoeBlock, +) +from tensorrt_llm._torch.modules.attention import MLA, Attention +from tensorrt_llm._torch.modules.fused_moe.interface import MoE +from tensorrt_llm._torch.modules.gated_mlp import GatedMLP + + +def mark_ranges(): + DeepseekV3Gate.forward = nvtx.annotate("DeepseekV3Gate")(DeepseekV3Gate.forward) + Qwen3NextGatedDeltaNet.forward = nvtx.annotate("Qwen3NextGatedDeltaNet")( + Qwen3NextGatedDeltaNet.forward + ) + Qwen3NextSparseMoeBlock.forward = nvtx.annotate("Qwen3NextSparseMoeBlock")( + Qwen3NextSparseMoeBlock.forward + ) + MLA.forward = nvtx.annotate("MLA")(MLA.forward) + Attention.forward = nvtx.annotate("Attention")(Attention.forward) + MoE.forward = nvtx.annotate("MoE")(MoE.forward) + GatedMLP.forward = nvtx.annotate("GatedMLP")(GatedMLP.forward) From 4029ab5cff30921ee6cb10bf6e47c47915c359a4 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 04:39:21 +0000 Subject: [PATCH 10/16] Add csv, add tests Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 16 +++++- examples/layer_wise_benchmarks/parse.py | 57 +++++++++++++++---- examples/layer_wise_benchmarks/template.html | 26 +++++++-- .../tools/test_layer_wise_benchmarks.py | 18 ++++++ 4 files changed, 98 insertions(+), 19 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index d0ebcf081a8..33d88e4f973 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -128,7 +128,21 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID ## Parse profiles -Coming soon. +Run the following command in the container: + +```bash +python3 parse.py --world-size 4 + +# Specify the location of the .nsys-rep file +python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0 +``` + +It can parse only GEN phase profiles for now. + +You will receive three reports, each containing kernel timing statistics grouped by module: +1. A printed report on stdout +2. A CSV report at `profiles/report_np4_rank0.csv` +3. 
An HTML report at `profiles/report_np4_rank0.html` ## Trouble shooting diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index e979c6105e6..af014edbe20 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -1,5 +1,6 @@ import argparse import bisect +import csv import json import re import sqlite3 @@ -15,6 +16,7 @@ parser.add_argument("--profile-dir", type=str, default="profiles") parser.add_argument("--world-size", "--np", type=int) parser.add_argument("--rank", type=int, default=0) +parser.add_argument("--warmup-times", type=int) group = parser.add_mutually_exclusive_group(required=False) group.add_argument("--error-on-unknown-kernel", action="store_true", dest="error_on_unknown_kernel") group.add_argument( @@ -87,6 +89,7 @@ def shortest_common_supersequence(a, b): profile_dir = Path(args.profile_dir) nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" +csv_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.csv" html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) @@ -301,6 +304,9 @@ def shortest_common_supersequence(a, b): ("torchSigmoid", "at::native::sigmoid_kernel_cuda"), ("torchMul", "at::native::binary_internal::MulFunctor<"), ("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"), + ("routingIndicesHistogramScores", "routingRenormalize::routingIndicesHistogramScoresKernel<"), + ("routingIndicesHistogram", "routingIndicesHistogramKernel<"), + ("routingIndicesOffsets", "routingIndicesOffsetsKernel<"), ] warned_names = set() @@ -319,19 +325,39 @@ def parse_kernel_name(demangledName): warned_names.add(name) if args.error_on_unknown_kernel: raise NotImplementedError(f"Unknown kernel name: {name}") - return name[:20] + return name[:30] converted_seqs = [] for runs in kernels: + warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times converted_seq = [] + # Kernel time for i, (demangledName, _, _, ranges) in enumerate(runs[0]): name = parse_kernel_name(demangledName) category = (*ranges, name) time_list = [run[i][2] - run[i][1] for run in runs] - time_list = time_list[run_args["warmup_times"] :] - t = np.mean(time_list).tolist() + t = np.mean(time_list[warmup_times:]).tolist() converted_seq.append((category, t)) + # Space and Overlap + overlap_list = [] + space_list = [] + for run in runs: + sorted_run = sorted(run, key=lambda op: op[1]) + last_end = sorted_run[0][1] + overlap_time = 0 + space_time = 0 + for _, start, end, _ in sorted_run: + if start > last_end: + space_time += start - last_end + else: + overlap_time += min(last_end, end) - start + last_end = max(last_end, end) + overlap_list.append(-overlap_time) + space_list.append(space_time) + converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist())) + converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist())) + converted_seq.append((("Total",), sum(t for _, t in converted_seq))) converted_seqs.append(converted_seq) merged_title = [] @@ -355,6 +381,7 @@ def parse_kernel_name(demangledName): ) stack = [] +csv_data = [["", *[problem["text"] for problem in problem_set]]] js_data = [] js_stack = [js_data] max_title_len = max((len(title) - 1) * 3 + len(title[-1]) for title in merged_title) @@ -374,20 +401,26 
@@ def parse_kernel_name(demangledName): stack.append(level_title) level = len(stack) print("|--" * (level - 1) + level_title) + csv_data.append(["|--" * (level - 1) + level_title]) js_stack.append([]) level = len(stack) + 1 print( "|--" * (level - 1) + title[-1] + " " * (max_title_len - (level - 1) * 3 - len(title[-1])), - *[f"{x / 1000:-6.2f}" for x in time_data], - ) - js_stack[-1].append( - { - "name": title[-1], - "time": [x / 1000 for x in time_data], - } + *[f"{x / 1000:-6.1f}" for x in time_data], ) -# TODO: Print overlap and space -# TODO: Statistics + csv_data.append(["|--" * (level - 1) + title[-1], *[f"{x / 1000:.1f}" for x in time_data]]) + if title != ("Total",): + js_stack[-1].append( + { + "name": title[-1], + "time": [x / 1000 for x in time_data], + } + ) +# TODO: Group repeated modules +with csv_file_path.open("w", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL) + for row in csv_data: + csv_writer.writerow(row) js_header_config = [{"name": problem["text"]} for problem in problem_set] loader = jinja2.FileSystemLoader(Path(__file__).parent) template = jinja2.Environment(loader=loader).get_template("template.html") diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html index 43f1d915a02..b076291322b 100644 --- a/examples/layer_wise_benchmarks/template.html +++ b/examples/layer_wise_benchmarks/template.html @@ -4,7 +4,11 @@ CUDA Kernel Latency Dashboard - + @@ -238,6 +269,13 @@ (Tip: Arrows to move, +/- to toggle, Click charts to select) +
[template.html: markup not preserved in this extract; this hunk adds a collapsible "🔧 Configuration (Click to expand)" panel that displays {{ runArgs }}]
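A note on the Overlap and Space rows introduced in parse.py above: they come from a single sweep over each run's kernel intervals in start order. The following is a minimal restatement of that accounting on made-up timestamps, a sketch rather than the patch's exact code.

```python
# Overlap is recorded as a negative quantity so that per-kernel time + Overlap + Space
# adds back up to the wall-clock span of the run (the "Total" row).
intervals = [(0, 10), (8, 14), (20, 25)]  # (start, end) per kernel, sorted by start
overlap = 0
space = 0
last_end = intervals[0][0]
for start, end in intervals:
    if start > last_end:
        space += start - last_end              # idle gap between consecutive kernels
    else:
        overlap += min(last_end, end) - start  # time already covered by earlier kernels
    last_end = max(last_end, end)
kernel_time = sum(end - start for start, end in intervals)
print(kernel_time, -overlap, space)   # 21 -2 6
print(kernel_time - overlap + space)  # 25, the span from 0 to 25
```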
From 08b9f596ad9cc50da3514051cec2f6b31d255cba Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 09:09:41 +0000 Subject: [PATCH 14/16] Fix according to coderabbitai's suggestions Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/README.md | 2 +- examples/layer_wise_benchmarks/slurm_query_container_name.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 33d88e4f973..e9db0f24ac7 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -121,7 +121,7 @@ Command line arguments are comma separated, for example, `--batch-size 1,2,4`. C Run with OpenMPI: -``` +```bash NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size 1,2,4 --seq-len-q 1024,8192 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size 32,64,128,256,512 --seq-len-q 1,2,3,4 ``` diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh index 2445ceaa37c..a7f6b1ba815 100755 --- a/examples/layer_wise_benchmarks/slurm_query_container_name.sh +++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh @@ -13,9 +13,9 @@ fi if [ "$count" -gt 1 ]; then echo "Error: Multiple containers found" >&2 - for match in "$matches"; do + while IFS= read -r match; do echo "- ${match#$prefix}" >&2 - done + done <<< "$matches" exit 1 fi From f61103d052ed2bf80d5fe9465d646624fa23561d Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 09:58:37 +0000 Subject: [PATCH 15/16] Fix select_alltoall_method_type Signed-off-by: Tailing Yuan --- tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py index fd6e679ac86..8ef5266b9e4 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py @@ -247,7 +247,7 @@ def make_select_alltoall_method_type_2(select_alltoall_method_type_orig): def select_alltoall_method_type(self): # Replace the condition `mapping.moe_ep_size <= top_k` with `scaled_from <= top_k` # by replacing `top_k` with `fake_top_k` - top_k = self.routing_method.top_k + top_k = self.routing_method.experts_per_token if scaled_from <= top_k: fake_top_k = mapping.moe_ep_size + 1 else: From 7e143ac01b48b71914d9db5c0b04eb6e3b6f9982 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 25 Nov 2025 10:03:56 +0000 Subject: [PATCH 16/16] Add more kernels to the parser Signed-off-by: Tailing Yuan --- examples/layer_wise_benchmarks/parse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py index 00f08aff4f4..36248d8f114 100644 --- a/examples/layer_wise_benchmarks/parse.py +++ b/examples/layer_wise_benchmarks/parse.py @@ -259,6 +259,11 @@ def shortest_common_supersequence(a, b): ("Cumsum", "computeCumsumDevice"), ("moveIndice", "moveIndiceDevice"), ("moeAllToAll", "moeAllToAllKernel"), + ("moeA2APrepareDispatch", "moe_comm::moeA2APrepareDispatchKernel"), + ("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"), + ("moeA2ASanitizeExpertIds", "moe_comm::moeA2ASanitizeExpertIdsKernel"), + ("moeA2APrepareCombine", "moe_comm::moeA2APrepareCombineKernel"), + ("moeA2ACombine", "moe_comm::moeA2ACombineKernel"), 
("memsetExpertIds", "memsetExpertIdsDevice"), ("blockSum", "blockExpertPrefixSumKernel"), ("globalSum", "globalExpertPrefixSumKernel"),
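This last commit only extends the `parser_keywords` table. The lookup loop itself is not visible in this extract, so the substring match below is an assumption; it sketches how such a (label, keyword) table is typically consumed by `parse_kernel_name`, with the unknown-kernel handling shown earlier in the patch.

```python
# Hedged sketch: map a demangled kernel name to a short row label via keyword match.
parser_keywords = [
    ("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"),
    ("blockSum", "blockExpertPrefixSumKernel"),
    ("globalSum", "globalExpertPrefixSumKernel"),
]

def label_for(demangled_name: str, error_on_unknown: bool = False) -> str:
    for label, keyword in parser_keywords:
        if keyword in demangled_name:
            return label
    print(f"Unknown kernel name: {demangled_name}")
    if error_on_unknown:                # mirrors --error-on-unknown-kernel
        raise NotImplementedError(f"Unknown kernel name: {demangled_name}")
    return demangled_name[:30]          # parse.py falls back to a truncated name

print(label_for("void moe_comm::moeA2ADispatchKernel<float>(...)"))  # -> moeA2ADispatch
```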