diff --git a/hyperactor/src/supervision.rs b/hyperactor/src/supervision.rs index 143ee9a09..370444a11 100644 --- a/hyperactor/src/supervision.rs +++ b/hyperactor/src/supervision.rs @@ -72,23 +72,49 @@ impl ActorSupervisionEvent { status => status, } } + + /// Returns true if this is a user actor event, false if it's a system actor event. + /// System actors (like "agent") are internal to Monarch and should not be exposed to users. + pub fn is_user_actor_event(&self) -> bool { + self.actor_id.name() != "agent" + } } impl std::error::Error for ActorSupervisionEvent {} impl fmt::Display for ActorSupervisionEvent { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}: {} at {}", - self.actor_id, - self.actor_status, - DateTime::::from(self.occurred_at), - )?; - if let Some(message_headers) = &self.message_headers { - let headers = serde_json::to_string(&message_headers) - .expect("could not serialize message headers"); - write!(f, " (headers: {})", headers)?; + if self.is_user_actor_event() { + write!( + f, + "{}: {} at {}", + self.actor_id, + self.actor_status, + DateTime::::from(self.occurred_at), + )?; + if let Some(message_headers) = &self.message_headers { + let headers = serde_json::to_string(&message_headers) + .expect("could not serialize message headers"); + write!(f, " (headers: {})", headers)?; + } + } else { + // System actor event - show simplified message + match self.actor_id.proc_id() { + crate::reference::ProcId::Direct(addr, _) => { + write!( + f, + "{} is not reachable, check the log on the host for details", + addr + )?; + } + crate::reference::ProcId::Ranked(_, _) => { + write!( + f, + "{} is not reachable, check the log on the host for details", + self.actor_id.proc_id() + )?; + } + } } if let Some(caused_by) = &self.caused_by { write!(f, ": (caused by: {})", caused_by)?; diff --git a/monarch_hyperactor/src/v1/actor_mesh.rs b/monarch_hyperactor/src/v1/actor_mesh.rs index 85850ea0c..29dd089c5 100644 --- 
a/monarch_hyperactor/src/v1/actor_mesh.rs +++ b/monarch_hyperactor/src/v1/actor_mesh.rs @@ -442,15 +442,22 @@ fn send_state_change( } else { Unhealthy::Crashed(event.clone()) }; - let event_actor_id = event.actor_id.clone(); let py_event = PyActorSupervisionEvent::from(event.clone()); - let pyerr = PyErr::new::(format!( - "Actor {} exited because of the following reason: {}", - event_actor_id, - py_event - .__repr__() - .expect("repr failed on PyActorSupervisionEvent") - )); + let pyerr = if event.is_user_actor_event() { + PyErr::new::(format!( + "Actor {} exited because of the following reason: {}", + event.actor_id, + py_event + .__repr__() + .expect("repr failed on PyActorSupervisionEvent") + )) + } else { + PyErr::new::( + py_event + .__repr__() + .expect("repr failed on PyActorSupervisionEvent"), + ) + }; sender.send(Some(pyerr)).expect("receiver closed"); } diff --git a/python/tests/test_actor_error.py b/python/tests/test_actor_error.py index 144d105f5..62430d277 100644 --- a/python/tests/test_actor_error.py +++ b/python/tests/test_actor_error.py @@ -692,7 +692,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh) -> None: # new call should fail with check of health state of actor mesh with pytest.raises( SupervisionError, - match="actor mesh is stopped due to proc mesh shutdown|Actor .* exited because of the following reason.*stopped", + match="actor mesh is stopped due to proc mesh shutdown|.* is not reachable, check the log on the host for details", ): await actor_mesh.check.call()