From 15a49dd0ad1a5e97c2a55026ed4ffbc0433a2633 Mon Sep 17 00:00:00 2001 From: Marius Eriksen Date: Thu, 13 Nov 2025 11:51:04 -0800 Subject: [PATCH] [monarch_hyperactor] propagate proc status in supervision events for proc failures Currently, we report the agent to be Stopped. This is accurate, but confusing, and could be better attributed. Here, we synthesize an actor failure by: 1) attributing the fault to the corresponding actor in the monitored actor mesh; 2) elevating the proc_status (which contains mode of failure, exit code, etc.) into the actor failure, making it clear that it is a process failure. In the future: 1) We will have a more general "Failure" struct, explicitly capturing host, proc, actor, etc., failures. 2) We will still attribute the failure to the actor (it is the most proximate), but move the proc failure to the "cause" (i.e., proc failure caused actor to fail), which is the most correct and clear. Differential Revision: [D86993889](https://our.internmc.facebook.com/intern/diff/D86993889/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D86993889/)! 
[ghstack-poisoned] --- .../src/v1/host_mesh/mesh_agent.rs | 2 +- monarch_hyperactor/src/v1/actor_mesh.rs | 29 +++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs b/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs index edd0e4076..9d89d64e0 100644 --- a/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs +++ b/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs @@ -355,7 +355,7 @@ impl Handler> for HostMeshAgent { cx: &Context, get_state: resource::GetState, ) -> anyhow::Result<()> { - let manager = self + let manager: Option<&BootstrapProcManager> = self .host .as_mut() .expect("host") diff --git a/monarch_hyperactor/src/v1/actor_mesh.rs b/monarch_hyperactor/src/v1/actor_mesh.rs index 85850ea0c..b8525a916 100644 --- a/monarch_hyperactor/src/v1/actor_mesh.rs +++ b/monarch_hyperactor/src/v1/actor_mesh.rs @@ -15,6 +15,7 @@ use hyperactor::Actor; use hyperactor::ActorHandle; use hyperactor::ActorRef; use hyperactor::RemoteMessage; +use hyperactor::actor::ActorErrorKind; use hyperactor::actor::ActorStatus; use hyperactor::actor::Referable; use hyperactor::actor::RemotableActor; @@ -22,6 +23,7 @@ use hyperactor::clock::Clock; use hyperactor::clock::RealClock; use hyperactor::context; use hyperactor::supervision::ActorSupervisionEvent; +use hyperactor_mesh::bootstrap::ProcStatus; use hyperactor_mesh::dashmap::DashMap; use hyperactor_mesh::proc_mesh::mesh_agent::ActorState; use hyperactor_mesh::resource; @@ -502,7 +504,7 @@ async fn actor_states_monitor( ActorSupervisionEvent::new( cx.instance().self_id().clone(), ActorStatus::generic_failure(format!( - "Unable to query for proc states: {:?}", + "unable to query for proc states: {:?}", e )), None, @@ -520,18 +522,27 @@ async fn actor_states_monitor( } if let Some(proc_states) = proc_states.unwrap() { // Check if the proc mesh is still alive. 
- if let Some((rank, state)) = proc_states + if let Some((point, state)) = proc_states .iter() .find(|(_rank, state)| state.status.is_terminating()) { send_state_change( - rank.rank(), + point.rank(), ActorSupervisionEvent::new( - state - .state - .map(|s| s.mesh_agent.actor_id().clone()) - .unwrap_or(cx.instance().self_id().clone()), - ActorStatus::Stopped, + // Attribute this to the monitored actor, even if the underlying + // cause is a proc_failure. We propagate the cause explicitly. + mesh.get(point.rank()).unwrap().actor_id().clone(), + // TODO: allow "actor supervision event" to be general, and + // make the proc failure the cause. + ActorStatus::Failed(ActorErrorKind::Generic(format!( + "process failure: {}", + state + .state + .and_then(|state| state.proc_status) + .unwrap_or_else(|| ProcStatus::Failed { + reason: "unknown".to_string() + }) + ))), None, None, ), @@ -555,7 +566,7 @@ async fn actor_states_monitor( ActorSupervisionEvent::new( cx.instance().self_id().clone(), ActorStatus::generic_failure(format!( - "Unable to query for actor states: {:?}", + "unable to query for actor states: {:?}", e )), None,