 from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream
 from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size
 from tensorrt_llm.tools.layer_wise_benchmarks.runner_base import BalanceMethod
-from tensorrt_llm.tools.layer_wise_benchmarks.runner_factory import \
-    get_runner_cls
+from tensorrt_llm.tools.layer_wise_benchmarks.runner_factory import get_runner_cls


 def comma_separated_ints(s):
@@ -24,32 +23,25 @@ def comma_separated_ints(s):
 parser.add_argument(
     "--layer-indices",
     type=comma_separated_ints,
-    help="Comma separated indices of layers, should be a contiguous range")
+    help="Comma separated indices of layers, should be a contiguous range",
+)
 parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"])
 parser.add_argument("--scaled-from", type=int)
 # KV cache related args
 parser.add_argument("--max-batch-size", type=int)
 parser.add_argument("--tokens-per-block", type=int)
 parser.add_argument("--max-seq-len", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
-group.add_argument("--enable-attention-dp",
-                   action="store_true",
-                   dest="enable_attention_dp")
-group.add_argument("--no-enable-attention-dp",
-                   action="store_false",
-                   dest="enable_attention_dp")
+group.add_argument("--enable-attention-dp", action="store_true", dest="enable_attention_dp")
+group.add_argument("--no-enable-attention-dp", action="store_false", dest="enable_attention_dp")
 parser.set_defaults(enable_attention_dp=None)
 # Model init args
 parser.add_argument("--max-num-tokens", type=int)
 parser.add_argument("--moe-backend", type=str)
 parser.add_argument("--moe-max-num-tokens", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
-group.add_argument("--use-cuda-graph",
-                   action="store_true",
-                   dest="use_cuda_graph")
-group.add_argument("--no-use-cuda-graph",
-                   action="store_false",
-                   dest="use_cuda_graph")
+group.add_argument("--use-cuda-graph", action="store_true", dest="use_cuda_graph")
+group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph")
 parser.set_defaults(use_cuda_graph=None)
 # Per iteration args
 parser.add_argument("--batch-size", type=int)
@@ -85,35 +77,41 @@ def comma_separated_ints(s):
     tokens_per_block=args.tokens_per_block,
     max_batch_size=args.max_batch_size,
     max_seq_len=args.max_seq_len,
-    layer_indices=args.layer_indices)
-attn_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
+    layer_indices=args.layer_indices,
+)
+attn_workspace = torch.empty((0,), device="cuda", dtype=torch.int8)

 # Create other global objects
 AutoTuner.get().clear_cache()
 capture_stream = torch.cuda.Stream()

 # Create Runner
-runner = Runner(args.model,
-                mapping,
-                moe_backend=args.moe_backend,
-                layer_indices=args.layer_indices,
-                scaled_from=args.scaled_from,
-                max_seq_len=args.max_seq_len,
-                max_num_tokens=args.max_num_tokens,
-                moe_max_num_tokens=args.moe_max_num_tokens,
-                use_cuda_graph=args.use_cuda_graph)
+runner = Runner(
+    args.model,
+    mapping,
+    moe_backend=args.moe_backend,
+    layer_indices=args.layer_indices,
+    scaled_from=args.scaled_from,
+    max_seq_len=args.max_seq_len,
+    max_num_tokens=args.max_num_tokens,
+    moe_max_num_tokens=args.moe_max_num_tokens,
+    use_cuda_graph=args.use_cuda_graph,
+)

 # Warm up
 assert args.batch_size <= args.max_batch_size
 assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len
-run_pack = runner.create_run_pack(args.run_type,
-                                  batch_size=args.batch_size,
-                                  seq_len_q=args.seq_len_q,
-                                  seq_len_kv_cache=args.seq_len_kv_cache,
-                                  kv_cache_manager=kv_cache_manager,
-                                  attn_workspace=attn_workspace)
-runner.replace_routing_method(balance_method=BalanceMethod[args.balance_method],
-                              balance_ratio=args.balance_ratio)
+run_pack = runner.create_run_pack(
+    args.run_type,
+    batch_size=args.batch_size,
+    seq_len_q=args.seq_len_q,
+    seq_len_kv_cache=args.seq_len_kv_cache,
+    kv_cache_manager=kv_cache_manager,
+    attn_workspace=attn_workspace,
+)
+runner.replace_routing_method(
+    balance_method=BalanceMethod[args.balance_method], balance_ratio=args.balance_ratio
+)
 capture_stream.wait_stream(torch.cuda.current_stream())
 with torch.cuda.stream(capture_stream):
     run_pack()
@@ -127,21 +125,15 @@ def comma_separated_ints(s):
 if args.use_cuda_graph:
     with with_multi_stream(True):
         g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g,
-                              stream=capture_stream,
-                              capture_error_mode="global"):
+        with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"):
             run_pack()

 warmup_times = 20
 run_times = 100
-events = [
-    torch.cuda.Event(enable_timing=True)
-    for _ in range(warmup_times + run_times + 1)
-]
+events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)]
 for i in range(warmup_times + run_times):
     events[i].record()
-    with nvtx.annotate(
-            f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"):
+    with nvtx.annotate(f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"):
         if args.use_cuda_graph:
             g.replay()
         else:
@@ -151,16 +143,16 @@ def comma_separated_ints(s):

 # Print statistics
 # Print before `cudaProfilerStop` to ensure messages are included in the profile
-time_list = [
-    start.elapsed_time(stop) for start, stop in zip(events, events[1:])
-]
+time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])]
 time_list = time_list[warmup_times:]
-print(f"[RANK {rank}]"
-      f" min {np.min(time_list) * 1000:.1f}"
-      f" max {np.max(time_list) * 1000:.1f}"
-      f" mean {np.mean(time_list) * 1000:.1f}"
-      f" median {np.median(time_list) * 1000:.1f}"
-      f" P90 {np.percentile(time_list, 90) * 1000:.1f}"
-      f" (us)")
+print(
+    f"[RANK {rank}]"
+    f" min {np.min(time_list) * 1000:.1f}"
+    f" max {np.max(time_list) * 1000:.1f}"
+    f" mean {np.mean(time_list) * 1000:.1f}"
+    f" median {np.median(time_list) * 1000:.1f}"
+    f" P90 {np.percentile(time_list, 90) * 1000:.1f}"
+    f" (us)"
+)

 torch.cuda.cudart().cudaProfilerStop()
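
A minimal, self-contained sketch of the measurement pattern the script above relies on (CUDA graph capture plus paired CUDA events, warmup iterations dropped, statistics in microseconds). The work() function and the iteration counts here are illustrative placeholders, not part of the benchmark script:

import numpy as np
import torch

assert torch.cuda.is_available()


def work(x):
    # Placeholder workload standing in for run_pack(); any CUDA kernel sequence works.
    return x @ x


x = torch.randn(1024, 1024, device="cuda")

# Warm up once, then capture the workload into a CUDA graph.
work(x)
torch.cuda.synchronize()
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    y = work(x)

warmup_times = 5
run_times = 20
events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)]
for i in range(warmup_times + run_times):
    events[i].record()
    g.replay()
events[-1].record()
torch.cuda.synchronize()

# Event.elapsed_time() returns milliseconds; multiply by 1000 for microseconds.
time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])]
time_list = time_list[warmup_times:]
print(
    f"min {np.min(time_list) * 1000:.1f}"
    f" mean {np.mean(time_list) * 1000:.1f}"
    f" P90 {np.percentile(time_list, 90) * 1000:.1f}"
    f" (us)"
)

Recording device-side events rather than host timers keeps the measurement on the GPU timeline and excludes Python launch overhead, which is why the script brackets each replay with events instead of calling time.time().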