Skip to content

Commit a03cc4a

Browse files
pzhan9facebook-github-bot
authored andcommitted
Add asserts for actor status transition (#1825)
Summary: Adding asserts so we can be more certain on the the actor status transition. Differential Revision: D86772033
1 parent 6273659 commit a03cc4a

File tree

2 files changed

+52
-20
lines changed

2 files changed

+52
-20
lines changed

hyperactor/src/actor.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,12 @@ pub enum ActorStatus {
481481
impl ActorStatus {
482482
/// Tells whether the status is a terminal state.
483483
pub fn is_terminal(&self) -> bool {
484-
matches!(self, Self::Stopped | Self::Failed(_))
484+
self.is_stopped() || self.is_failed()
485+
}
486+
487+
/// Tells whether the status is a Stopped state.
488+
pub fn is_stopped(&self) -> bool {
489+
matches!(self, Self::Stopped)
485490
}
486491

487492
/// Tells whether the status represents a failure.

hyperactor/src/proc.rs

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,14 @@ impl<A: Actor> Instance<A> {
10081008
self.status_tx.send_replace(new.clone());
10091009
}
10101010

1011+
fn is_terminal(&self) -> bool {
1012+
self.status_tx.borrow().is_terminal()
1013+
}
1014+
1015+
fn is_stopped(&self) -> bool {
1016+
self.status_tx.borrow().is_stopped()
1017+
}
1018+
10111019
/// This instance's actor ID.
10121020
pub fn self_id(&self) -> &ActorId {
10131021
self.mailbox.actor_id()
@@ -1099,21 +1107,35 @@ impl<A: Actor> Instance<A> {
10991107
.run_actor_tree(&mut actor, actor_loop_receivers, &mut work_rx)
11001108
.await;
11011109

1102-
let (actor_status, event) = match result {
1103-
Ok(_) => (ActorStatus::Stopped, None),
1104-
Err(ActorError {
1105-
kind: box ActorErrorKind::UnhandledSupervisionEvent(event),
1106-
..
1107-
}) => (event.actor_status.clone(), Some(event)),
1108-
Err(err) => (
1109-
ActorStatus::Failed(err.to_string()),
1110-
Some(ActorSupervisionEvent::new(
1111-
self.cell.actor_id().clone(),
1112-
ActorStatus::Failed(err.to_string()),
1113-
None,
1114-
None,
1115-
)),
1116-
),
1110+
let event = match result {
1111+
Ok(_) => {
1112+
// actor should have been stopped by run_actor_tree
1113+
assert!(self.is_stopped());
1114+
None
1115+
}
1116+
Err(err) => {
1117+
assert!(!self.is_terminal());
1118+
match *err.kind {
1119+
ActorErrorKind::UnhandledSupervisionEvent(event) => {
1120+
// Currently only terminated actors are allowed to raise supervision events.
1121+
// If we want to change that in the future, we need to modify the exit
1122+
// status here too, because we use event's actor_status as this actor's
1123+
// terminal status.
1124+
assert!(event.actor_status.is_terminal());
1125+
self.change_status(event.actor_status.clone());
1126+
Some(event)
1127+
}
1128+
_ => {
1129+
self.change_status(ActorStatus::Failed(err.to_string()));
1130+
Some(ActorSupervisionEvent::new(
1131+
self.cell.actor_id().clone(),
1132+
ActorStatus::Failed(err.to_string()),
1133+
None,
1134+
None,
1135+
))
1136+
}
1137+
}
1138+
}
11171139
};
11181140

11191141
if let Some(parent) = self.cell.maybe_unlink_parent() {
@@ -1141,7 +1163,6 @@ impl<A: Actor> Instance<A> {
11411163
self.proc.handle_supervision_event(event);
11421164
}
11431165
}
1144-
self.change_status(actor_status);
11451166
}
11461167

11471168
/// Runs the actor, and manages its supervision tree. When the function returns,
@@ -1179,10 +1200,16 @@ impl<A: Actor> Instance<A> {
11791200
}
11801201
};
11811202

1182-
if let Err(ref err) = result {
1183-
tracing::error!("{}: actor failure: {}", self.self_id(), err);
1203+
match &result {
1204+
Ok(_) => assert!(self.is_stopped()),
1205+
Err(err) => {
1206+
tracing::error!("{}: actor failure: {}", self.self_id(), err);
1207+
assert!(!self.is_terminal());
1208+
// Send Stopping instead of Failed, because we still need to
1209+
// unlink child actors.
1210+
self.change_status(ActorStatus::Stopping);
1211+
}
11841212
}
1185-
self.change_status(ActorStatus::Stopping);
11861213

11871214
// After this point, we know we won't spawn any more children,
11881215
// so we can safely read the current child keys.

0 commit comments

Comments
 (0)