@@ -222,6 +222,7 @@ pub(crate) fn update_event_actor_id(mut event: ActorSupervisionEvent) -> ActorSu
222222 resource:: StopAll { cast = true } ,
223223 resource:: GetState <ActorState > { cast = true } ,
224224 resource:: GetRankStatus { cast = true } ,
225+ resource:: GetAllRankStatus { cast = true } ,
225226 ]
226227) ]
227228pub struct ProcMeshAgent {
@@ -274,12 +275,14 @@ impl ProcMeshAgent {
274275 proc. spawn :: < Self > ( "agent" , agent) . await
275276 }
276277
277- async fn destroy_and_wait < ' a > (
278+ async fn destroy_and_wait_except_current < ' a > (
278279 & mut self ,
279280 cx : & Context < ' a , Self > ,
280281 timeout : tokio:: time:: Duration ,
281282 ) -> Result < ( Vec < ActorId > , Vec < ActorId > ) , anyhow:: Error > {
282- self . proc . destroy_and_wait :: < Self > ( timeout, Some ( cx) ) . await
283+ self . proc
284+ . destroy_and_wait_except_current :: < Self > ( timeout, Some ( cx) , true )
285+ . await
283286 }
284287}
285288
@@ -451,9 +454,9 @@ impl Handler<ActorSupervisionEvent> for ProcMeshAgent {
451454 let event = update_event_actor_id ( event) ;
452455 if self . record_supervision_events {
453456 tracing:: info!(
454- "Received supervision event on proc {}: {:?}, recording" ,
455- self . proc . proc_id ( ) ,
456- event
457+ proc_id = % self . proc. proc_id ( ) ,
458+ %event ,
459+ "recording supervision event" ,
457460 ) ;
458461 self . supervision_events
459462 . entry ( event. actor_id . clone ( ) )
@@ -467,9 +470,9 @@ impl Handler<ActorSupervisionEvent> for ProcMeshAgent {
467470 // the whole process.
468471 tracing:: error!(
469472 name = SupervisionEventState :: SupervisionEventTransmitFailed . as_ref( ) ,
470- "proc {}: could not propagate supervision event {:?}: crashing" ,
471- cx . self_id ( ) . proc_id ( ) ,
472- event
473+ proc_id = %cx . self_id ( ) . proc_id ( ) ,
474+ %event ,
475+ "could not propagate supervision event, crashing" ,
473476 ) ;
474477
475478 // We should have a custom "crash" function here, so that this works
@@ -559,67 +562,38 @@ impl Handler<resource::CreateOrUpdate<ActorSpec>> for ProcMeshAgent {
559562#[ async_trait]
560563impl Handler < resource:: Stop > for ProcMeshAgent {
561564 async fn handle ( & mut self , cx : & Context < Self > , message : resource:: Stop ) -> anyhow:: Result < ( ) > {
562- use crate :: v1:: StatusOverlay ;
563-
564565 // We don't remove the actor from the state map, instead we just store
565566 // its state as Stopped.
566567 let actor = self . actor_states . get_mut ( & message. name ) ;
567- enum StatusOrActorId {
568- Status ( resource:: Status ) ,
569- ActorId ( ActorId ) ,
570- }
571568 // Have to separate stop_actor from setting "stopped" because it borrows
572569 // as mutable and cannot have self borrowed mutably twice.
573- let ( rank , actor_id) = match actor {
570+ let actor_id = match actor {
574571 Some ( actor_state) => {
575- let rank = actor_state. create_rank ;
576572 match & actor_state. spawn {
577573 Ok ( actor_id) => {
578574 if actor_state. stopped {
579- ( rank , StatusOrActorId :: Status ( resource :: Status :: Stopped ) )
575+ None
580576 } else {
581577 actor_state. stopped = true ;
582- ( rank , StatusOrActorId :: ActorId ( actor_id. clone ( ) ) )
578+ Some ( actor_id. clone ( ) )
583579 }
584580 }
585581 // If the original spawn had failed, the actor is still considered
586582 // successfully stopped.
587- Err ( _) => ( rank , StatusOrActorId :: Status ( resource :: Status :: Stopped ) ) ,
583+ Err ( _) => None ,
588584 }
589585 }
590586 // TODO: represent unknown rank
591- None => (
592- usize:: MAX ,
593- StatusOrActorId :: Status ( resource:: Status :: NotExist ) ,
594- ) ,
587+ None => None ,
595588 } ;
596589 let timeout = hyperactor:: config:: global:: get ( hyperactor:: config:: STOP_ACTOR_TIMEOUT ) ;
597- let status = match actor_id {
598- StatusOrActorId :: Status ( s) => s,
599- StatusOrActorId :: ActorId ( actor_id) => {
600- // If this fails, we will still leave the actor_state as stopped,
601- // because it shouldn't be attempted again.
602- let stop_result = self
603- . stop_actor ( cx, actor_id, timeout. as_millis ( ) as u64 )
604- . await ?;
605- match stop_result {
606- // use Stopped as a successful result.
607- StopActorResult :: Success => resource:: Status :: Stopped ,
608- StopActorResult :: Timeout => resource:: Status :: Timeout ( timeout) ,
609- StopActorResult :: NotFound => resource:: Status :: NotExist ,
610- }
611- }
612- } ;
613-
614- // Send a sparse overlay update. If rank is unknown, emit an
615- // empty overlay.
616- let overlay = if rank == usize:: MAX {
617- StatusOverlay :: new ( )
618- } else {
619- StatusOverlay :: try_from_runs ( vec ! [ ( rank..( rank + 1 ) , status) ] )
620- . expect ( "valid single-run overlay" )
621- } ;
622- message. reply . send ( cx, overlay) ?;
590+ if let Some ( actor_id) = actor_id {
591+ // While this function returns a Result, it never returns an Err
592+ // value so we can simply expect without any failure handling.
593+ self . stop_actor ( cx, actor_id, timeout. as_millis ( ) as u64 )
594+ . await
595+ . expect ( "stop_actor cannot fail" ) ;
596+ }
623597
624598 Ok ( ( ) )
625599 }
@@ -635,12 +609,21 @@ impl Handler<resource::StopAll> for ProcMeshAgent {
635609 let timeout = hyperactor:: config:: global:: get ( hyperactor:: config:: STOP_ACTOR_TIMEOUT ) ;
636610 // By passing in the self context, destroy_and_wait will stop this agent
637611 // last, after all others are stopped.
638- let _stop_result = self . destroy_and_wait ( cx, timeout) . await ?;
639- for ( _, actor_state) in self . actor_states . iter_mut ( ) {
640- // Mark all actors as stopped.
641- actor_state. stopped = true ;
612+ let stop_result = self . destroy_and_wait_except_current ( cx, timeout) . await ;
613+ match stop_result {
614+ Ok ( _) => {
615+ for ( _, actor_state) in self . actor_states . iter_mut ( ) {
616+ // Mark all actors as stopped.
617+ actor_state. stopped = true ;
618+ }
619+ Ok ( ( ) )
620+ }
621+ Err ( e) => Err ( anyhow:: anyhow!(
622+ "failed to StopAll on {}: {:?}" ,
623+ cx. self_id( ) ,
624+ e
625+ ) ) ,
642626 }
643- Ok ( ( ) )
644627 }
645628}
646629
@@ -713,6 +696,69 @@ impl Handler<resource::GetRankStatus> for ProcMeshAgent {
713696 }
714697}
715698
699+ #[ async_trait]
700+ impl Handler < resource:: GetAllRankStatus > for ProcMeshAgent {
701+ async fn handle (
702+ & mut self ,
703+ cx : & Context < Self > ,
704+ get_rank_status : resource:: GetAllRankStatus ,
705+ ) -> anyhow:: Result < ( ) > {
706+ use crate :: resource:: Status ;
707+
708+ let mut ranks = Vec :: new ( ) ;
709+ for ( _name, state) in self . actor_states . iter ( ) {
710+ match state {
711+ ActorInstanceState {
712+ spawn : Ok ( actor_id) ,
713+ create_rank,
714+ stopped,
715+ } => {
716+ if * stopped {
717+ ranks. push ( ( * create_rank, resource:: Status :: Stopped ) ) ;
718+ } else {
719+ let supervision_events = self
720+ . supervision_events
721+ . get ( actor_id)
722+ . map_or_else ( Vec :: new, |a| a. clone ( ) ) ;
723+ ranks. push ( (
724+ * create_rank,
725+ if supervision_events. is_empty ( ) {
726+ resource:: Status :: Running
727+ } else {
728+ resource:: Status :: Failed ( format ! (
729+ "because of supervision events: {:?}" ,
730+ supervision_events
731+ ) )
732+ } ,
733+ ) ) ;
734+ }
735+ }
736+ ActorInstanceState {
737+ spawn : Err ( e) ,
738+ create_rank,
739+ ..
740+ } => {
741+ ranks. push ( ( * create_rank, Status :: Failed ( e. to_string ( ) ) ) ) ;
742+ }
743+ }
744+ }
745+
746+ let result = get_rank_status. reply . send ( cx, ranks) ;
747+ // Ignore errors, because returning Err from here would cause the ProcMeshAgent
748+ // to be stopped, which would prevent querying and spawning other actors.
749+ // This only means some actor that requested the state of an actor failed to receive it.
750+ if let Err ( e) = result {
751+ tracing:: warn!(
752+ actor = %cx. self_id( ) ,
753+ "failed to send GetRankStatus reply to {} due to error: {}" ,
754+ get_rank_status. reply. port_id( ) . actor_id( ) ,
755+ e
756+ ) ;
757+ }
758+ Ok ( ( ) )
759+ }
760+ }
761+
716762#[ async_trait]
717763impl Handler < resource:: GetState < ActorState > > for ProcMeshAgent {
718764 async fn handle (
0 commit comments