Commit 40eca52
[monarch_hyperactor] propagate proc status in supervision events for proc failures
Pull Request resolved: #1877

Currently, we report the agent to be Stopped. This is accurate, but confusing, and could be better attributed. Here, we synthesize an actor failure by:

1) attributing the fault to the corresponding actor in the monitored actor mesh;
2) elevating the proc_status (which contains the mode of failure, exit code, etc.) into the actor failure, making it clear that it is a process failure.

In the future:

1) We will have a more general "Failure" struct, explicitly capturing host, proc, actor, etc., failures.
2) We will attribute the actor failure (it is the most proximate), but move the proc failure to the "cause" (i.e., the proc failure caused the actor to fail), which is the most correct and clear.

ghstack-source-id: 323362899
@exported-using-ghexport

Differential Revision: [D86993889](https://our.internmc.facebook.com/intern/diff/D86993889/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D86993889/)!
1 parent 7194160 commit 40eca52
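As a rough, self-contained illustration of the attribution change described in the commit message (the types below are simplified stand-ins, not the real hyperactor APIs), the monitor now folds a terminating proc's status into a failure attributed to the monitored actor instead of reporting the agent as Stopped:

```rust
use std::fmt;

// Simplified stand-in for hyperactor_mesh::bootstrap::ProcStatus.
#[derive(Debug, Clone)]
enum ProcStatus {
    Failed { reason: String },
}

impl fmt::Display for ProcStatus {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ProcStatus::Failed { reason } => write!(f, "Failed(\"{}\")", reason),
        }
    }
}

// Simplified stand-in for hyperactor::actor::ActorStatus.
#[derive(Debug)]
enum ActorStatus {
    Failed(String),
}

// Elevate the proc-level status (mode of failure, exit code, etc.) into the
// actor status carried by the supervision event, defaulting to an unknown
// failure when no proc status is available.
fn synthesize_actor_failure(proc_status: Option<ProcStatus>) -> ActorStatus {
    let status = proc_status.unwrap_or(ProcStatus::Failed {
        reason: "unknown".to_string(),
    });
    ActorStatus::Failed(format!("process failure: {}", status))
}

fn main() {
    let failure = synthesize_actor_failure(Some(ProcStatus::Failed {
        reason: "Killed(sig=11)".to_string(),
    }));
    // Prints: Failed("process failure: Failed(\"Killed(sig=11)\")")
    println!("{:?}", failure);
}
```

The real change (see actor_mesh.rs below) does the same thing with ActorErrorKind::Generic and the actual proc_status reported for the terminating proc.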

File tree

3 files changed: +25 -12 lines


hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs

Lines changed: 1 addition & 1 deletion

@@ -355,7 +355,7 @@ impl Handler<resource::GetState<ProcState>> for HostMeshAgent {
         cx: &Context<Self>,
         get_state: resource::GetState<ProcState>,
     ) -> anyhow::Result<()> {
-        let manager = self
+        let manager: Option<&BootstrapProcManager> = self
             .host
             .as_mut()
             .expect("host")

monarch_hyperactor/src/v1/actor_mesh.rs

Lines changed: 21 additions & 10 deletions

@@ -15,13 +15,15 @@ use hyperactor::Actor;
 use hyperactor::ActorHandle;
 use hyperactor::ActorRef;
 use hyperactor::RemoteMessage;
+use hyperactor::actor::ActorErrorKind;
 use hyperactor::actor::ActorStatus;
 use hyperactor::actor::Referable;
 use hyperactor::actor::RemotableActor;
 use hyperactor::clock::Clock;
 use hyperactor::clock::RealClock;
 use hyperactor::context;
 use hyperactor::supervision::ActorSupervisionEvent;
+use hyperactor_mesh::bootstrap::ProcStatus;
 use hyperactor_mesh::dashmap::DashMap;
 use hyperactor_mesh::proc_mesh::mesh_agent::ActorState;
 use hyperactor_mesh::resource;

@@ -466,7 +468,7 @@ fn send_state_change<F>(
 /// * is_owned is true if this monitor is running on the owning instance. When true,
 ///   a message will be sent to "owner" if it is not None. If owner is None,
 ///   then a panic will be raised instead to crash the client.
-/// * time_between_tasks controls how frequently to poll.
+/// * time_between_tasks controls how frequently to poll.
 async fn actor_states_monitor<A, F>(
     cx: &impl context::Actor,
     mesh: ActorMeshRef<A>,

@@ -502,7 +504,7 @@ async fn actor_states_monitor<A, F>(
             ActorSupervisionEvent::new(
                 cx.instance().self_id().clone(),
                 ActorStatus::generic_failure(format!(
-                    "Unable to query for proc states: {:?}",
+                    "unable to query for proc states: {:?}",
                     e
                 )),
                 None,

@@ -520,18 +522,27 @@
         }
         if let Some(proc_states) = proc_states.unwrap() {
             // Check if the proc mesh is still alive.
-            if let Some((rank, state)) = proc_states
+            if let Some((point, state)) = proc_states
                 .iter()
                 .find(|(_rank, state)| state.status.is_terminating())
             {
                 send_state_change(
-                    rank.rank(),
+                    point.rank(),
                     ActorSupervisionEvent::new(
-                        state
-                            .state
-                            .map(|s| s.mesh_agent.actor_id().clone())
-                            .unwrap_or(cx.instance().self_id().clone()),
-                        ActorStatus::Stopped,
+                        // Attribute this to the monitored actor, even if the underlying
+                        // cause is a proc_failure. We propagate the cause explicitly.
+                        mesh.get(point.rank()).unwrap().actor_id().clone(),
+                        // TODO: allow "actor supervision event" to be general, and
+                        // make the proc failure the cause.
+                        ActorStatus::Failed(ActorErrorKind::Generic(format!(
+                            "process failure: {}",
+                            state
+                                .state
+                                .and_then(|state| state.proc_status)
+                                .unwrap_or_else(|| ProcStatus::Failed {
+                                    reason: "unknown".to_string()
+                                })
+                        ))),
                         None,
                         None,
                     ),

@@ -555,7 +566,7 @@ async fn actor_states_monitor<A, F>(
             ActorSupervisionEvent::new(
                 cx.instance().self_id().clone(),
                 ActorStatus::generic_failure(format!(
-                    "Unable to query for actor states: {:?}",
+                    "unable to query for actor states: {:?}",
                     e
                 )),
                 None,

python/tests/test_actor_error.py

Lines changed: 3 additions & 1 deletion

@@ -652,6 +652,7 @@ async def test_sigsegv_handling():
 
     # Depending on the timing, any of these messages could come back first.
    error_msg = (
+        "actor mesh is stopped due to proc mesh shutdown|"
         'actor mesh is stopped due to proc mesh shutdown.*Failed\\("Killed\\(sig=11.*\\)"\\)|'
         "Actor .* exited because of the following reason|"
         "Actor .* is unhealthy with reason"

@@ -692,7 +693,8 @@ async def test_supervision_with_proc_mesh_stopped(mesh) -> None:
     # new call should fail with check of health state of actor mesh
     with pytest.raises(
         SupervisionError,
-        match="actor mesh is stopped due to proc mesh shutdown|Actor .* exited because of the following reason.*stopped",
+        match="actor mesh is stopped due to proc mesh shutdown"
+        + "|Actor .* exited because of the following reason.*process failure",
     ):
         await actor_mesh.check.call()
