Commit 6909e17 (1 parent: 548f5ce)

[TRTLLM-8274][feat] Check if executor is shutdown in /health entrypoint

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>

3 files changed: 28 additions, 7 deletions

  tensorrt_llm/llmapi/llm.py                          +6 / -0
  tensorrt_llm/serve/openai_server.py                 +7 / -1
  tests/unittest/llmapi/apps/_test_openai_metrics.py  +15 / -6

tensorrt_llm/llmapi/llm.py (6 additions, 0 deletions)

@@ -766,6 +766,12 @@ def shutdown(self) -> None:
             self.mpi_session.shutdown()
             self.mpi_session = None

+    def check_health(self) -> bool:
+        if hasattr(self, "_executor") and self._executor is not None:
+            return not self._executor.is_shutdown()
+
+        return False
+
     @staticmethod
     def _shutdown_wrapper(self_ref):
         # Retrieve the instance if it still exists
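The new LLM.check_health() reports healthy only while an executor exists and has not been shut down. Below is a minimal, self-contained sketch of that guard pattern; the FakeExecutor and FakeLLM classes are hypothetical stand-ins, not part of the commit:

# Hypothetical stand-ins illustrating the check_health() guard; the real
# executor exposes is_shutdown() as used in the diff above.
class FakeExecutor:
    def __init__(self):
        self._shut_down = False

    def shutdown(self):
        self._shut_down = True

    def is_shutdown(self) -> bool:
        return self._shut_down


class FakeLLM:
    def __init__(self):
        self._executor = FakeExecutor()

    def check_health(self) -> bool:
        # Healthy only while an executor exists and has not been shut down.
        if hasattr(self, "_executor") and self._executor is not None:
            return not self._executor.is_shutdown()
        return False


llm = FakeLLM()
assert llm.check_health() is True   # live executor -> healthy
llm._executor.shutdown()
assert llm.check_health() is False  # shut-down executor -> unhealthy
llm._executor = None
assert llm.check_health() is False  # no executor at all -> unhealthy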

tensorrt_llm/serve/openai_server.py (7 additions, 1 deletion)

@@ -233,6 +233,9 @@ def _create_response_id_not_found_error(self, response_id: str) -> Response:
             status_code=HTTPStatus.NOT_FOUND,
         )

+    def _check_health(self) -> bool:
+        return self.llm.check_health()
+
     def register_routes(self):
         self.app.add_api_route("/health", self.health, methods=["GET"])
         self.app.add_api_route("/health_generate", self.health_generate, methods=["GET"])
@@ -293,7 +296,10 @@ def register_mm_encoder_routes(self):
                                methods=["POST"])

     async def health(self) -> Response:
-        return Response(status_code=200)
+        if self._check_health():
+            return Response(status_code=200)
+        else:
+            return Response(status_code=503, content="LLM is unavailable. Please check the server logs for more details.")

     async def health_generate(self, raw_request: Request) -> Response:
         """Health check that performs a minimal generation."""

tests/unittest/llmapi/apps/_test_openai_metrics.py (15 additions, 6 deletions)

@@ -2,7 +2,6 @@

 import pytest
 from fastapi.testclient import TestClient
-from transformers import AutoTokenizer

 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig
@@ -14,26 +13,36 @@


 @pytest.fixture(scope="module")
-def client():
+def llm():
     build_config = BuildConfig()
     build_config.max_batch_size = 8
     build_config.max_seq_len = 512
     llm = PyTorchLLM(model=llama_model_path,
                      build_config=build_config,
                      kv_cache_config=KvCacheConfig(),
                      enable_iter_perf_stats=True)
-    hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
+    yield llm
+    llm.shutdown()

+
+@pytest.fixture(scope="module")
+def client(llm):
     app_instance = OpenAIServer(llm,
                                 model=llama_model_path,
-                                hf_tokenizer=hf_tokenizer)
+                                tool_parser=None,
+                                server_role=None,
+                                metadata_server_cfg=None)
     client = TestClient(app_instance.app)
     yield client


-def test_health(client):
+@pytest.mark.parametrize("is_healthy,response_code", [(True, 200),
+                                                      (False, 503)])
+def test_health(client, llm, is_healthy, response_code):
+    if not is_healthy:
+        llm.shutdown()
     response = client.get("/health")
-    assert response.status_code == 200
+    assert response.status_code == response_code


 def test_version(client):
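The parametrized test mutates a module-scoped fixture: the (False, 503) case shuts the shared llm down, so it depends on pytest running parameters in listed order, with (True, 200) first. The same 200/503 contract can also be exercised in isolation against a stub app, as in this sketch; the stub classes and route below are illustrative, not the server's actual wiring:

# Standalone sketch of the parametrized health test against a stub FastAPI
# app, so the 200/503 logic can be exercised without loading a model.
import pytest
from fastapi import FastAPI, Response
from fastapi.testclient import TestClient


class StubLLM:
    """Illustrative stand-in for the real LLM object."""

    def __init__(self):
        self._alive = True

    def shutdown(self):
        self._alive = False

    def check_health(self) -> bool:
        return self._alive


def make_app(llm: StubLLM) -> FastAPI:
    # Mirrors the shape of the health() handler added in openai_server.py.
    app = FastAPI()

    @app.get("/health")
    async def health() -> Response:
        if llm.check_health():
            return Response(status_code=200)
        return Response(status_code=503, content="LLM is unavailable.")

    return app


@pytest.mark.parametrize("is_healthy,response_code", [(True, 200),
                                                      (False, 503)])
def test_health_stub(is_healthy, response_code):
    llm = StubLLM()
    client = TestClient(make_app(llm))
    if not is_healthy:
        llm.shutdown()
    assert client.get("/health").status_code == response_code

Building a fresh StubLLM per test case avoids the shared-state ordering concern entirely.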
