From e482a7167be2e9230e573b7218ea072ca65c755c Mon Sep 17 00:00:00 2001
From: Shunkang <182541032+Shunkangz@users.noreply.github.com>
Date: Mon, 10 Nov 2025 09:16:54 +0000
Subject: [PATCH] Add docs

Signed-off-by: Shunkang <182541032+Shunkangz@users.noreply.github.com>
---
 examples/llm-api/llm_kv_cache_connector.py | 89 ++++++++++++++++++++--
 1 file changed, 84 insertions(+), 5 deletions(-)

diff --git a/examples/llm-api/llm_kv_cache_connector.py b/examples/llm-api/llm_kv_cache_connector.py
index 1eac9a9cd98..2e87e0c6868 100644
--- a/examples/llm-api/llm_kv_cache_connector.py
+++ b/examples/llm-api/llm_kv_cache_connector.py
@@ -1,6 +1,84 @@
 ### :title KV Cache Connector
 ### :order 6
 ### :section Customization
+'''
+This script demonstrates the KV cache connector feature in TensorRT-LLM, which enables
+custom persistence and reuse of KV cache blocks across different LLM instances.
+
+**Scenario:**
+The script implements a persistent KV cache connector that saves computed KV cache blocks
+to disk and loads them back in subsequent runs, eliminating redundant computation for
+recurring prompts.
+
+**What is a KV Cache Connector?**
+
+A KV cache connector is a customizable interface that allows you to:
+1. **Save KV Cache:** Persist computed KV cache blocks to external storage
+   (disk, database, distributed cache, etc.)
+2. **Load KV Cache:** Retrieve previously computed cache blocks instead of recomputing them
+3. **Share Cache Across Instances:** Reuse cache blocks across different LLM instances
+   or sessions, unlike regular block reuse, which is limited to a single instance
+
+**How It Works:**
+
+This example implements a `PersistentKvCacheConnector` with two key components
+(a simplified sketch follows the list):
+
+* **PersistentKvCacheConnectorLeader (Scheduler):**
+  - Hashes token sequences to create unique identifiers for each cache block
+  - Checks if cached blocks exist on disk for incoming requests
+  - Schedules load operations for cache hits
+  - Schedules save operations for newly computed blocks
+
+* **PersistentKvCacheConnectorWorker:**
+  - Executes the actual load/save operations between GPU and disk
+  - Loads cached blocks from disk files into GPU memory
+  - Saves newly computed blocks from GPU to disk files
+
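+The snippet below is a deliberately simplified, illustrative sketch of this scheduler/worker
+split. The names used here (`block_hash`, `ToyLeader`, `ToyWorker`, `plan`, `save`, `load`)
+are hypothetical and do not mirror the real connector interface; only the idea matches what
+this example does: hash the token ids of a block, then move that block between GPU memory
+and disk under its hash.
+
+```python
+import hashlib
+from pathlib import Path
+
+import torch
+
+
+def block_hash(tokens: list[int]) -> str:
+    # Content-based hash that identifies a KV cache block by its token ids.
+    return hashlib.sha256(str(tokens).encode()).hexdigest()
+
+
+class ToyLeader:
+    """Decides, per block, whether to load an existing cache file or save a new one."""
+
+    def __init__(self, cache_dir: str):
+        self.cache_dir = Path(cache_dir)
+
+    def plan(self, tokens: list[int]) -> str:
+        path = self.cache_dir / f"{block_hash(tokens)}.pt"
+        return "load" if path.exists() else "save"
+
+
+class ToyWorker:
+    """Executes the GPU <-> disk transfers that the leader scheduled."""
+
+    def __init__(self, cache_dir: str):
+        self.cache_dir = Path(cache_dir)
+
+    def save(self, tokens: list[int], kv_block: torch.Tensor) -> None:
+        torch.save(kv_block.cpu(), self.cache_dir / f"{block_hash(tokens)}.pt")
+
+    def load(self, tokens: list[int]) -> torch.Tensor:
+        return torch.load(self.cache_dir / f"{block_hash(tokens)}.pt")
+```
+
+The real interface has more hooks than this sketch; see the file referenced in the
+Implementation Notes below for the full definition.
+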
+**Demonstration:**
+
+The script processes the same prompt twice using two separate LLM instances:
+
+1. **First Run (Instance 1):**
+   - The LLM computes the KV cache for the input prompt
+   - The connector saves the computed cache blocks to disk (as .pt files)
+   - The generation completes and the LLM instance is destroyed
+
+2. **Second Run (Instance 2):**
+   - A new LLM instance is created with the same connector configuration
+   - When processing the same prompt, the connector finds matching cache blocks on disk
+   - The cache is loaded from disk instead of being recomputed
+   - **Expected Outcome:** Faster prefill, as cache blocks are loaded rather than computed
+   - Both outputs should be identical, demonstrating deterministic cache reuse
+
+**Key Benefits:**
+
+- **Cross-Instance Cache Sharing:** Share computed caches across multiple LLM instances
+- **Persistent Storage:** Cache survives beyond the lifetime of a single LLM instance
+- **Custom Storage Backends:** Implement any storage mechanism (shown here: disk files)
+- **Reduced Computation:** Eliminate redundant KV cache computation for repeated prompts
+
+**How to Run:**
+
+```bash
+python llm_kv_cache_connector.py
+```
+
+Example with an explicit model:
+```bash
+python llm_kv_cache_connector.py meta-llama/Llama-3.1-8B-Instruct
+```
+
+**Implementation Notes:**
+
+- This example uses content-based hashing to identify cache blocks
+- Cache files are stored in a temporary directory (cleaned up after the demo)
+- The implementation is simplified and not optimized for production use
+- Chunked prefill is not supported in this example
+- See `tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py` for the full connector interface
+
+**NOTE:** This example connector implementation is designed for demonstration purposes
+and is NOT suitable for production use without additional optimizations and error handling.
+'''

 import os
 import sys
@@ -17,11 +95,6 @@
 from tensorrt_llm.bindings.internal.batch_manager import LlmRequest
 from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig, TorchLlmArgs

-# This is a simple example of the use of the KV cache connector.
-# It persists KV cache contents into a folder, and can load them back on subsequent runs.
-# See tensorrt_llm/_torch/pyexecutor/connector.py for details about the KV cache connector interface.
-# NOTE: This example connector implementation is NOT suitable for production use.
-
 CONNECTOR_CACHE_FOLDER_KEY = "CONNECTOR_CACHE_FOLDER"

@@ -198,6 +271,7 @@ def main(model: str):
     this_module = __file__[__file__.rfind("/") + 1:__file__.rfind(".py")]

+    # --- KV Cache Connector Config ---
     kv_connector_config = KvCacheConnectorConfig(
         connector_module=this_module,
         connector_scheduler_class="PersistentKvCacheConnectorLeader",
@@ -207,6 +281,7 @@
     connector_cache_dir = TemporaryDirectory()
     os.environ[CONNECTOR_CACHE_FOLDER_KEY] = connector_cache_dir.name

+    # Create an LLM instance with the KV cache connector enabled.
     llm = LLM(model=model,
               backend="pytorch",
               cuda_graph_config=None,
@@ -220,6 +295,7 @@
     sampling_params = SamplingParams(max_tokens=32)

+    # Generate text with the first LLM instance; the connector saves the newly computed KV cache blocks.
     output = llm.generate([test_text], sampling_params)

     text0 = output[0].outputs[0].text
@@ -228,16 +304,19 @@
     del llm

+    # Create a new LLM instance with the same connector configuration.
     llm = LLM(model=model,
               backend="pytorch",
               cuda_graph_config=None,
               kv_connector_config=kv_connector_config)

+    # Generate text with the second LLM instance; it should reuse the KV cache blocks saved by the connector.
     output = llm.generate([test_text], sampling_params)

     text1 = output[0].outputs[0].text

     print("Second output (using connector cache): ", text1)

+    # Verify that the two outputs are identical.
     assert text0 == text1

     connector_cache_dir.cleanup()
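
For quick reference, the following is a condensed sketch of how the pieces added in the patch
fit together when run outside this example script. It is illustrative only: the
`connector_worker_class` field, the model name, the prompt, and importing the example as the
module `llm_kv_cache_connector` are assumptions; the other names (`KvCacheConnectorConfig`,
`CONNECTOR_CACHE_FOLDER`, `backend="pytorch"`) are taken from the patch.

```python
# Condensed, illustrative wiring sketch; connector_worker_class and the model name are assumed.
import os
from tempfile import TemporaryDirectory

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig

# Point the connector at a cache directory (a persistent path would let the cache
# survive across script runs).
cache_dir = TemporaryDirectory()
os.environ["CONNECTOR_CACHE_FOLDER"] = cache_dir.name

kv_connector_config = KvCacheConnectorConfig(
    connector_module="llm_kv_cache_connector",  # assumes the example is importable by this name
    connector_scheduler_class="PersistentKvCacheConnectorLeader",
    connector_worker_class="PersistentKvCacheConnectorWorker",  # assumed field name
)

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
          backend="pytorch",
          cuda_graph_config=None,
          kv_connector_config=kv_connector_config)

output = llm.generate(["What does a KV cache connector do?"],
                      SamplingParams(max_tokens=32))
print(output[0].outputs[0].text)

cache_dir.cleanup()
```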