Skip to content

Commit c2562fc

Browse files
authored
[https://nvbugs/5687820][fix] Remove self.abort() in DetokenizedGenerationResult (#9449)
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent 1c9158f commit c2562fc

File tree

5 files changed

+11
-3
lines changed

5 files changed

+11
-3
lines changed

examples/disaggregated/slurm/benchmark/config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ worker_config:
76   76      - 2048
77   77      - 256
78   78      print_iter_log: true
     79  +   trust_remote_code: true
79   80      kv_cache_config:
80   81        enable_block_reuse: false
81   82        free_gpu_memory_fraction: 0.8
@@ -102,6 +103,7 @@ worker_config:
102  103     enable_attention_dp: true
103  104     pipeline_parallel_size: 1
104  105     print_iter_log: true
     106  +  trust_remote_code: true
105  107     cuda_graph_config: null
106  108     disable_overlap_scheduler: true
107  109     kv_cache_config:

examples/disaggregated/slurm/benchmark/run_benchmark.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ for concurrency in ${concurrency_list}; do
52   52      --dataset-path ${dataset_file} \
53   53      --num-prompts ${num_prompts} \
54   54      --max-concurrency ${concurrency} \
     55  +   --trust-remote-code \
55   56      --ignore-eos \
56   57      --no-test-input \
57   58      --save-result \

examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ for concurrency in ${concurrency_list}; do
75   75      --dataset-name random \
76   76      --num-prompts "${num_prompts}" \
77   77      --max-concurrency "${concurrency}" \
     78  +   --trust-remote-code \
78   79      --ignore-eos \
79   80      --random-input-len "${input_seq_len}" \
80   81      --random-output-len "${output_seq_len}" \

examples/disaggregated/slurm/benchmark/submit.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,13 @@ def submit_job(config):
105  105     log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}")
106  106
107  107     # Get eplb num_slots for gen worker
108       -  eplb_num_slots = (config['worker_config']['gen'].get('moe_config', {}).get(
109       -      'load_balancer', {}).get('num_slots', 0))
     108  +  load_balancer_config = config['worker_config']['gen'].get(
     109  +      'moe_config', {}).get('load_balancer', {})
     110  +  if isinstance(load_balancer_config, str):
     111  +      with open(load_balancer_config, 'r') as f:
     112  +          load_balancer_config = yaml.safe_load(f)
     113  +  eplb_num_slots = load_balancer_config.get('num_slots', 0)
     114  +
110  115     # Determine directory suffix based on attention_dp
111  116     if gen_enable_attention_dp:
112  117         dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"

tensorrt_llm/executor/result.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,6 @@ def _handle_response(self, response: "GenerationExecutor.Response"):
761  761
762  762                     beam_output.finish_reason = 'stop'
763  763                     beam_output.stop_reason = stop_reason
764       -                 self.abort()
765  764                     self._done = True
766  765                     break
767  766
0 commit comments

Comments (0)