From 6da9fffd6c51d82c0f69bf1ece4b9b1f8de2115f Mon Sep 17 00:00:00 2001 From: Jun Li Date: Thu, 13 Nov 2025 10:18:37 -0800 Subject: [PATCH] make supervision timeout message more user friendly Summary: Right now, when anything related to the mesh agent happens (e.g. connection down, proc stopped), we print a supervision error message like ``` monarch._rust_bindings.monarch_hyperactor.supervision.SupervisionError: Actor metatls:twshared12411.02.gtn2.facebook.com:36037,anon_0_15HL6RLpNvRw,agent[0] exited because of the following reason: ``` This message names the mesh agent actor, which is a Monarch-internal actor and should not be exposed to customers. It confuses users into thinking the failure is always caused by Monarch internals. This diff changes the message to tell the user explicitly what the next step for investigation is. The new log for agent-related errors will look like ``` twshared234234.gtn3:23232 is not reacheable, check the log on the host for details ``` A follow-up diff will include a Scuba link as part of the message; Scuba will show the Monarch error log and the stderr logs. Differential Revision: D86984496 --- hyperactor/src/supervision.rs | 48 +++++++++++++++++++------ monarch_hyperactor/src/v1/actor_mesh.rs | 23 +++++++----- python/tests/test_actor_error.py | 2 +- 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/hyperactor/src/supervision.rs b/hyperactor/src/supervision.rs index 143ee9a09..370444a11 100644 --- a/hyperactor/src/supervision.rs +++ b/hyperactor/src/supervision.rs @@ -72,23 +72,49 @@ impl ActorSupervisionEvent { status => status, } } + + /// Returns true if this is a user actor event, false if it's a system actor event. + /// System actors (like "agent") are internal to Monarch and should not be exposed to users. 
+ pub fn is_user_actor_event(&self) -> bool { + self.actor_id.name() != "agent" + } } impl std::error::Error for ActorSupervisionEvent {} impl fmt::Display for ActorSupervisionEvent { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}: {} at {}", - self.actor_id, - self.actor_status, - DateTime::::from(self.occurred_at), - )?; - if let Some(message_headers) = &self.message_headers { - let headers = serde_json::to_string(&message_headers) - .expect("could not serialize message headers"); - write!(f, " (headers: {})", headers)?; + if self.is_user_actor_event() { + write!( + f, + "{}: {} at {}", + self.actor_id, + self.actor_status, + DateTime::::from(self.occurred_at), + )?; + if let Some(message_headers) = &self.message_headers { + let headers = serde_json::to_string(&message_headers) + .expect("could not serialize message headers"); + write!(f, " (headers: {})", headers)?; + } + } else { + // System actor event - show simplified message + match self.actor_id.proc_id() { + crate::reference::ProcId::Direct(addr, _) => { + write!( + f, + "{} is not reacheable, check the log on the host for details", + addr + )?; + } + crate::reference::ProcId::Ranked(_, _) => { + write!( + f, + "{} is not reacheable, check the log on the host for details", + self.actor_id.proc_id() + )?; + } + } } if let Some(caused_by) = &self.caused_by { write!(f, ": (caused by: {})", caused_by)?; diff --git a/monarch_hyperactor/src/v1/actor_mesh.rs b/monarch_hyperactor/src/v1/actor_mesh.rs index 85850ea0c..29dd089c5 100644 --- a/monarch_hyperactor/src/v1/actor_mesh.rs +++ b/monarch_hyperactor/src/v1/actor_mesh.rs @@ -442,15 +442,22 @@ fn send_state_change( } else { Unhealthy::Crashed(event.clone()) }; - let event_actor_id = event.actor_id.clone(); let py_event = PyActorSupervisionEvent::from(event.clone()); - let pyerr = PyErr::new::(format!( - "Actor {} exited because of the following reason: {}", - event_actor_id, - py_event - .__repr__() - .expect("repr failed on 
PyActorSupervisionEvent") - )); + let pyerr = if event.is_user_actor_event() { + PyErr::new::(format!( + "Actor {} exited because of the following reason: {}", + event.actor_id, + py_event + .__repr__() + .expect("repr failed on PyActorSupervisionEvent") + )) + } else { + PyErr::new::( + py_event + .__repr__() + .expect("repr failed on PyActorSupervisionEvent"), + ) + }; sender.send(Some(pyerr)).expect("receiver closed"); } diff --git a/python/tests/test_actor_error.py b/python/tests/test_actor_error.py index 144d105f5..62430d277 100644 --- a/python/tests/test_actor_error.py +++ b/python/tests/test_actor_error.py @@ -692,7 +692,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh) -> None: # new call should fail with check of health state of actor mesh with pytest.raises( SupervisionError, - match="actor mesh is stopped due to proc mesh shutdown|Actor .* exited because of the following reason.*stopped", + match="actor mesh is stopped due to proc mesh shutdown|.* is not reacheable, check the log on the host for details", ): await actor_mesh.check.call()