`examples/models/core/deepseek_v3` — 1 file changed, +20 −2 lines
````diff
@@ -247,6 +247,7 @@ cuda_graph_config:
   max_batch_size: 1024
 enable_attention_dp: false
 kv_cache_config:
+  enable_block_reuse: false
   dtype: fp8
 stream_interval: 10
 EOF
````
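The hunk above sets `enable_block_reuse: false`, turning off KV-cache block reuse, where blocks holding an already-seen prompt prefix are shared across requests instead of being recomputed in prefill. A toy Python sketch of prefix-block reuse (a hypothetical structure, not the real TensorRT-LLM KV-cache manager) illustrates the mechanism being disabled; with reuse on, a repeated prompt would skip prefill for its cached blocks, which is likely why benchmark configs disable it:

```python
from hashlib import sha256

BLOCK_TOKENS = 64  # tokens per KV-cache block (illustrative size)

class ToyBlockCache:
    """Toy prefix-block cache: a block is reusable only if every token
    up to and including that block matches a previously seen request."""

    def __init__(self, enable_block_reuse: bool = True):
        self.enable_block_reuse = enable_block_reuse
        self.blocks = {}  # prefix hash -> cached block id

    def _key(self, tokens, end):
        return sha256(str(tokens[:end]).encode()).hexdigest()

    def insert(self, tokens):
        for start in range(0, len(tokens) - BLOCK_TOKENS + 1, BLOCK_TOKENS):
            key = self._key(tokens, start + BLOCK_TOKENS)
            self.blocks.setdefault(key, len(self.blocks))

    def lookup(self, tokens):
        """Return how many leading tokens can skip prefill via reuse."""
        if not self.enable_block_reuse:
            return 0
        reused = 0
        for start in range(0, len(tokens) - BLOCK_TOKENS + 1, BLOCK_TOKENS):
            if self._key(tokens, start + BLOCK_TOKENS) not in self.blocks:
                break
            reused = start + BLOCK_TOKENS
        return reused

cache = ToyBlockCache(enable_block_reuse=True)
cache.insert(list(range(200)))
print(cache.lookup(list(range(200))))  # 192: three full blocks reused
print(ToyBlockCache(enable_block_reuse=False).lookup(list(range(200))))  # 0
```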
````diff
@@ -258,22 +259,34 @@ cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
   enable_padding: true
   batch_sizes:
+  - 2048
   - 1024
   - 896
   - 512
+  - 384
   - 256
+  - 192
+  - 160
   - 128
+  - 96
   - 64
+  - 48
   - 32
+  - 24
   - 16
   - 8
   - 4
   - 2
   - 1
 kv_cache_config:
+  enable_block_reuse: false
   dtype: fp8
 stream_interval: 10
 enable_attention_dp: true
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
 EOF
 ```
 
````
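With `enable_padding: true`, a runtime batch whose size falls between two entries in `batch_sizes` is padded up to the next captured size so a pre-built CUDA graph can be replayed instead of falling back to eager execution. A minimal Python sketch of that bucketing (illustrative only; `padded_batch_size` is not a TensorRT-LLM API):

```python
import bisect

# Batch sizes for which CUDA graphs are captured, from the config above.
CUDA_GRAPH_BATCH_SIZES = [1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128,
                          160, 192, 256, 384, 512, 896, 1024, 2048]

def padded_batch_size(actual: int) -> int:
    """Pad a runtime batch up to the nearest captured CUDA-graph size."""
    idx = bisect.bisect_left(CUDA_GRAPH_BATCH_SIZES, actual)
    if idx == len(CUDA_GRAPH_BATCH_SIZES):
        raise ValueError(f"batch {actual} exceeds the largest captured size")
    return CUDA_GRAPH_BATCH_SIZES[idx]

print(padded_batch_size(20))    # 24
print(padded_batch_size(300))   # 384
print(padded_batch_size(2048))  # 2048
```

The finer spacing added between 16 and 512 (24, 48, 96, 160, 192, 384) reduces how much padding is wasted on mid-sized batches.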
````diff
@@ -285,6 +298,7 @@ cuda_graph_config:
   max_batch_size: 1024
 enable_attention_dp: false
 kv_cache_config:
+  enable_block_reuse: false
   dtype: fp8
   free_gpu_memory_fraction: 0.8
 stream_interval: 10
````
````diff
@@ -301,7 +315,12 @@ cuda_graph_config:
   enable_padding: true
   max_batch_size: 512
 enable_attention_dp: true
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
 kv_cache_config:
+  enable_block_reuse: false
   dtype: fp8
   free_gpu_memory_fraction: 0.8
 stream_interval: 10
````
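The new `attention_dp_config` block tunes how attention data parallelism balances work across ranks when `enable_balance: true`. A hypothetical sketch of how a gate could combine `batching_wait_iters` (minimum iterations to accumulate requests) and `timeout_iters` (maximum iterations to wait for all ranks to have work); this is illustrative only, not TensorRT-LLM's actual scheduler:

```python
def should_dispatch(waited_iters: int, all_ranks_busy: bool,
                    batching_wait_iters: int = 0,
                    timeout_iters: int = 60) -> bool:
    """Hypothetical balance gate: wait at least batching_wait_iters to
    accumulate a batch, dispatch immediately once every rank has work,
    and stop waiting for balance after timeout_iters iterations."""
    if waited_iters < batching_wait_iters:
        return False
    if all_ranks_busy:
        return True
    return waited_iters >= timeout_iters

# With the values above (batching_wait_iters: 0, timeout_iters: 60):
print(should_dispatch(0, True))    # True: balanced, dispatch now
print(should_dispatch(10, False))  # False: keep waiting for balance
print(should_dispatch(60, False))  # True: timed out, dispatch anyway
```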
````diff
@@ -316,12 +335,11 @@ trtllm-serve \
   --host localhost \
   --port 8000 \
   --backend pytorch \
-  --max_batch_size 1024 \
+  --max_batch_size 2048 \
   --max_num_tokens 8192 \
   --tp_size 8 \
   --ep_size 8 \
   --pp_size 1 \
-  --kv_cache_free_gpu_memory_fraction 0.9 \
   --extra_llm_api_options ./extra-llm-api-config.yml
 ```
````
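`trtllm-serve` exposes an OpenAI-compatible HTTP API on the configured host and port. A quick smoke test against it might look like the following sketch; the model name is a placeholder, so substitute whatever you passed to `trtllm-serve`:

```python
import json
import urllib.request

# Placeholder model name: match the model you launched trtllm-serve with.
payload = {
    "model": "deepseek-ai/DeepSeek-V3",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# Uncomment once the server is up; kept commented so the snippet is
# runnable without a live endpoint.
# with urllib.request.urlopen(req) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])
```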
You may see OOM issues with some configs. Consider reducing `kv_cache_free_gpu_mem_fraction` to a smaller value as a workaround; we are investigating and addressing the problem. If you are using the max-throughput config, reduce `max_num_tokens` to `3072` to avoid OOM issues.