
Commit b7f5a04

[TRTLLM-8274][feat] Check if executor is shutdown in /health entrypoint
Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
1 parent 1797e91 commit b7f5a04

5 files changed (+49, -13 lines)

tensorrt_llm/llmapi/llm.py

Lines changed: 11 additions & 0 deletions
@@ -766,6 +766,17 @@ def shutdown(self) -> None:
             self.mpi_session.shutdown()
             self.mpi_session = None
 
+    def _check_health(self) -> bool:
+        """Check if the LLM is healthy.
+
+        Returns:
+            bool: True if the executor is running and not shutdown, False otherwise.
+        """
+        if hasattr(self, "_executor") and self._executor is not None:
+            return not self._executor.is_shutdown()
+
+        return False
+
     @staticmethod
     def _shutdown_wrapper(self_ref):
         # Retrieve the instance if it still exists

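For context, a minimal sketch (not part of the commit) of what the new internal helper reports before and after shutdown; the model path is a placeholder and the construction arguments are illustrative only.

# Illustrative sketch: exercising the new _check_health() helper.
# Assumes a locally available model; the path below is a placeholder.
from tensorrt_llm import LLM

llm = LLM(model="/path/to/model")   # placeholder model path
print(llm._check_health())          # expected True while the executor is running
llm.shutdown()
print(llm._check_health())          # expected False once the executor is shut down or gone
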
tensorrt_llm/serve/openai_server.py

Lines changed: 7 additions & 1 deletion
@@ -236,6 +236,9 @@ def _create_response_id_not_found_error(self, response_id: str) -> Response:
             status_code=HTTPStatus.NOT_FOUND,
         )
 
+    def _check_health(self) -> bool:
+        return self.llm._check_health()
+
     def register_routes(self):
         self.app.add_api_route("/health", self.health, methods=["GET"])
         self.app.add_api_route("/health_generate", self.health_generate, methods=["GET"])

@@ -296,7 +299,10 @@ def register_mm_encoder_routes(self):
                                methods=["POST"])
 
     async def health(self) -> Response:
-        return Response(status_code=200)
+        if self._check_health():
+            return Response(status_code=200)
+        else:
+            return Response(status_code=503, content="LLM is unavailable. Please check the server logs for more details.")
 
     async def health_generate(self, raw_request: Request) -> Response:
         """Health check that performs a minimal generation."""

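With this change, an external liveness probe can distinguish a running executor from a shut-down one. A minimal client-side sketch (not from the commit; the base URL and port are assumptions to adjust per deployment):

# Illustrative health probe against a running server exposing /health.
import requests

resp = requests.get("http://localhost:8000/health", timeout=5)
if resp.status_code == 200:
    print("LLM server healthy")
else:
    # Per this commit, the server returns 503 when the executor is shut down.
    print(f"LLM server unavailable ({resp.status_code}): {resp.text}")
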
tests/integration/defs/test_e2e.py

Lines changed: 8 additions & 0 deletions
@@ -1665,6 +1665,14 @@ def test_openai_responses(llm_root, llm_venv):
                       str(test_root / "_test_openai_responses.py")])
 
 
+def test_openai_health(llm_root, llm_venv):
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pytest",
+        str(test_root / "_test_openai_metrics.py -k test_health")
+    ])
+
+
 def test_openai_prometheus(llm_root, llm_venv):
     test_root = unittest_path() / "llmapi" / "apps"
     llm_venv.run_cmd(

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 1 addition & 0 deletions
@@ -137,6 +137,7 @@ l0_a10:
       - llmapi/test_llm_e2e.py::test_llmapi_exit
       - llmapi/test_llm_examples.py::test_llmapi_server_example
       - llmapi/test_llm_examples.py::test_llmapi_kv_cache_connector[Qwen2-0.5B]
+      - test_e2e.py::test_openai_health
       - test_e2e.py::test_trtllm_serve_example
       - test_e2e.py::test_trtllm_serve_top_logprobs[trt]
       - test_e2e.py::test_openai_misc_example[trt]

tests/unittest/llmapi/apps/_test_openai_metrics.py

Lines changed: 22 additions & 12 deletions
@@ -1,11 +1,12 @@
 """Test the metrics endpoint when using OpenAI API to send requests"""
 
+from unittest.mock import patch
+
 import pytest
 from fastapi.testclient import TestClient
-from transformers import AutoTokenizer
 
 from tensorrt_llm import LLM as PyTorchLLM
-from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig
+from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.serve.openai_server import OpenAIServer
 
 from ..test_llm import llama_model_path

@@ -14,26 +15,35 @@
 
 
 @pytest.fixture(scope="module")
-def client():
-    build_config = BuildConfig()
-    build_config.max_batch_size = 8
-    build_config.max_seq_len = 512
+def llm():
     llm = PyTorchLLM(model=llama_model_path,
-                     build_config=build_config,
                      kv_cache_config=KvCacheConfig(),
                      enable_iter_perf_stats=True)
-    hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
+    yield llm
+    llm.shutdown()
+
 
+@pytest.fixture(scope="module")
+def client(llm):
     app_instance = OpenAIServer(llm,
                                 model=llama_model_path,
-                                hf_tokenizer=hf_tokenizer)
+                                tool_parser=None,
+                                server_role=None,
+                                metadata_server_cfg=None)
     client = TestClient(app_instance.app)
     yield client
 
 
-def test_health(client):
-    response = client.get("/health")
-    assert response.status_code == 200
+@pytest.mark.parametrize("is_healthy,response_code", [(True, 200),
+                                                      (False, 503)])
+def test_health(client, llm, is_healthy, response_code):
+    if not is_healthy:
+        with patch.object(llm._executor, 'is_shutdown', return_value=True):
+            response = client.get("/health")
+            assert response.status_code == response_code
+    else:
+        response = client.get("/health")
+        assert response.status_code == response_code
 
 
 def test_version(client):

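The parametrized test simulates the unhealthy case by mocking the executor. A self-contained sketch of the same mocking idea, using hypothetical stand-in classes rather than TensorRT-LLM code:

# Stand-in objects (hypothetical) showing how patching is_shutdown()
# flips the health check, which is what drives the 200 vs. 503 responses.
from unittest.mock import patch


class FakeExecutor:
    def is_shutdown(self) -> bool:
        return False


class FakeLLM:
    def __init__(self):
        self._executor = FakeExecutor()

    def _check_health(self) -> bool:
        return self._executor is not None and not self._executor.is_shutdown()


llm = FakeLLM()
assert llm._check_health() is True
with patch.object(llm._executor, "is_shutdown", return_value=True):
    assert llm._check_health() is False  # mirrors the 503 branch of /health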