@@ -146,6 +146,7 @@ def launch_disaggregated_llm(
146146
147147 for i , port in enumerate (ctx_ports ):
148148 env_ctx = os .environ .copy ()
149+ env_ctx ["TRTLLM_USE_UCX_KVCACHE" ] = "1"
149150 gpu_range = range (current_gpu_offset ,
150151 current_gpu_offset + ctx_total_gpus )
151152 env_ctx ["CUDA_VISIBLE_DEVICES" ] = "," .join (map (str , gpu_range ))
@@ -166,6 +167,7 @@ def launch_disaggregated_llm(
166167
167168 for i , port in enumerate (gen_ports ):
168169 env_gen = os .environ .copy ()
170+ env_gen ["TRTLLM_USE_UCX_KVCACHE" ] = "1"
169171 gpu_range = range (current_gpu_offset ,
170172 current_gpu_offset + gen_total_gpus )
171173 env_gen ["CUDA_VISIBLE_DEVICES" ] = "," .join (map (str , gpu_range ))
@@ -1103,15 +1105,12 @@ def test_chunked_prefill(self):
11031105 },
11041106 "enable_chunked_prefill" : True ,
11051107 "max_num_tokens" : 256 ,
1106- "max_batch_size" :
1107- 1 , # max_batch_size=1 will stabilize the accuracy test result at a cost of speed
11081108 }
11091109 gen_server_config = {
11101110 "cuda_graph_config" : None ,
11111111 "cache_transceiver_config" : {
11121112 "backend" : "DEFAULT"
1113- },
1114- "max_batch_size" : 1 ,
1113+ }
11151114 }
11161115 disaggregated_server_config = {
11171116 "hostname" : "localhost" ,
0 commit comments