
Commit 7664abb

Add specific AD configs for nano-v3

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent 11132fe · commit 7664abb

3 files changed: +48 −0 lines changed

examples/auto_deploy/.gitignore (2 additions, 0 deletions)

@@ -5,3 +5,5 @@ benchmark_results.json
 # ignore config files that users might put here for debugging
 *.yaml
 !nano_v3.yaml
+!nano_v3_accuracy.yaml
+!nano_v3_bench.yaml
examples/auto_deploy/nano_v3_accuracy.yaml (23 additions, 0 deletions)

@@ -0,0 +1,23 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 128
+max_seq_len: 204800
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+free_mem_ratio: 0.9
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128]
+kv_cache_config:
+  # disable kv_cache reuse since not supported for hybrid/ssm models
+  enable_block_reuse: false
+transforms:
+  detect_sharding:
+    sharding_source: ['factory', 'heuristic']
+    sharding_dims: ['ep', 'bmm']
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (null) for speed
+  insert_cached_ssm_attention:
+    cache_config:
+      mamba_dtype: float32
+      # mamba_dtype: null
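
A minimal sketch of reading this config back in Python, for instance to confirm the accuracy variant's float32 mamba cache dtype. The path examples/auto_deploy/nano_v3_accuracy.yaml is inferred from the .gitignore diff above, and plain PyYAML is used here rather than any loader from the repository itself:

import yaml  # pip install pyyaml

# Assumed path, inferred from the .gitignore entries in this commit.
CONFIG_PATH = "examples/auto_deploy/nano_v3_accuracy.yaml"

with open(CONFIG_PATH) as f:
    cfg = yaml.safe_load(f)

# The accuracy config pins the mamba cache dtype to float32;
# the bench config leaves it at the default (null -> None) for speed.
cache_cfg = cfg["transforms"]["insert_cached_ssm_attention"]["cache_config"]
print("mamba_dtype:", cache_cfg["mamba_dtype"])
print("max_batch_size:", cfg["max_batch_size"])
print("cuda_graph_batch_sizes:", cfg["cuda_graph_batch_sizes"])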
examples/auto_deploy/nano_v3_bench.yaml (23 additions, 0 deletions)

@@ -0,0 +1,23 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 384 # tunable
+max_seq_len: 65536 # tunable
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+free_mem_ratio: 0.9
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+kv_cache_config:
+  # disable kv_cache reuse since not supported for hybrid/ssm models
+  enable_block_reuse: false
+transforms:
+  detect_sharding:
+    sharding_source: ['factory', 'heuristic']
+    sharding_dims: ['ep', 'bmm']
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (null) for speed
+  insert_cached_ssm_attention:
+    cache_config:
+      # mamba_dtype: float32
+      mamba_dtype: null
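
For a quick side-by-side of the two variants, a hedged sketch (same path assumptions as above) that prints the top-level keys whose values differ between the accuracy and bench configs, e.g. max_batch_size, max_seq_len, cuda_graph_batch_sizes, and the nested mamba_dtype under transforms:

import yaml  # pip install pyyaml


def load(path):
    with open(path) as f:
        return yaml.safe_load(f)


# Assumed paths, inferred from the .gitignore entries in this commit.
acc = load("examples/auto_deploy/nano_v3_accuracy.yaml")
bench = load("examples/auto_deploy/nano_v3_bench.yaml")

# Print every top-level key whose value differs between the two variants.
for key in sorted(set(acc) | set(bench)):
    if acc.get(key) != bench.get(key):
        print(f"{key}: accuracy={acc.get(key)!r}  bench={bench.get(key)!r}")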
