Add cleanup function to Actor trait and use it from PythonActor (#1836)

dulinriley · meta-codesync[bot] · commit b8d1524f4064 · 2025-11-13T16:54:22.000-08:00
Summary: Pull Request resolved: #1836 Fixes #1849 RFC: Add a "cleanup" for actors to run on stop. This method is invoked after all child actors are stopped, but before the current actor exits. If an exception is thrown, it will become an error at shutdown which will propagate. The cleanup method in Python can be sync or async, and must match the sync-ness of the endpoints. It takes one argument, which is an `Optional[Exception]`. If this was an abnormal exit, that exception is not None; specifically, it is set to the result of `ActorError::to_string` (wrapped in an exception object). Later on we may be able to preserve the original exception, if there is one. The motivation comes from actors that want to call `dist.destroy_process_group()`, as that is one of the most frequent cleanup actions among users of monarch. Actors should *not* call `stop()` on any actor or proc meshes they own. This will be handled automatically in the future, and they will have already been stopped by the time cleanup is invoked. This cleanup is per-actor, not per-proc. So if the cleanup is destroying process-wide resources (as does "destroy_process_group"), then the actor shouldn't be colocated with any other actors on the same proc using the same resource. If the cleanup takes too long, ignore the result and continue with stopping. ProcMesh::stop() already does a graceful stop of all actors, so this cleanup will be run automatically when proc meshes are stopped. Reviewed By: mariusae Differential Revision: D85624518 fbshipit-source-id: 172eeebf18eddc1a7f5928dcc31efd4cd9120287
diff --git a/hyperactor/src/actor.rs b/hyperactor/src/actor.rs
@@ -82,6 +82,25 @@ pub trait Actor: Sized + Send + Debug + 'static {
         Ok(())
     }
 
+    /// Clean up resources used by this actor before it shuts down.
+    ///
+    /// Notably this function is async, which allows more complex cleanup.
+    /// Simpler cleanup can be handled by the `impl Drop` for this actor.
+    ///
+    /// `_err` is the error that this actor is failing with, if any:
+    /// - If it is not `None`, any errors returned by this function will be
+    ///   logged and ignored.
+    /// - If it is `None`, any errors returned by this function will be
+    ///   propagated as an `ActorError`.
+    ///
+    /// This function is not called if there is a panic in the actor, as the
+    /// actor may be in an indeterminate state. It is also not called if the
+    /// process is killed: there is no atexit handler or signal handler.
+    ///
+    /// The default implementation performs no cleanup and returns `Ok(())`.
+    async fn cleanup(
+        &mut self,
+        _this: &Instance<Self>,
+        _err: Option<&ActorError>,
+    ) -> Result<(), anyhow::Error> {
+        // Default implementation: no cleanup.
+        Ok(())
+    }
+
     /// Spawn a child actor, given a spawning capability (usually given by [`Instance`]).
     /// The spawned actor will be supervised by the parent (spawning) actor.
     async fn spawn(
@@ -343,6 +362,11 @@ impl ActorErrorKind {
         Self::Generic(format!("initialization error: {}", err))
     }
 
+    /// Error during actor cleanup.
+    pub fn cleanup(err: anyhow::Error) -> Self {
+        Self::Generic(format!("cleanup error: {}", err))
+    }
+
     /// An underlying mailbox error.
     pub fn mailbox(err: MailboxError) -> Self {
         Self::Generic(err.to_string())
diff --git a/hyperactor/src/config.rs b/hyperactor/src/config.rs
@@ -155,6 +155,14 @@ declare_attrs! {
     })
     pub attr STOP_ACTOR_TIMEOUT: Duration = Duration::from_secs(10);
 
+    /// Timeout used by proc for running the cleanup callback on an actor.
+    /// Should be less than the timeout for STOP_ACTOR_TIMEOUT.
+    /// The default is 3 seconds. Its config env name is
+    /// `HYPERACTOR_CLEANUP_TIMEOUT`, and it has no `py_name`.
+    @meta(CONFIG = ConfigAttr {
+        env_name: Some("HYPERACTOR_CLEANUP_TIMEOUT".to_string()),
+        py_name: None,
+    })
+    pub attr CLEANUP_TIMEOUT: Duration = Duration::from_secs(3);
+
     /// Heartbeat interval for remote allocator
     @meta(CONFIG = ConfigAttr {
         env_name: Some("HYPERACTOR_REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL".to_string()),
diff --git a/hyperactor/src/proc.rs b/hyperactor/src/proc.rs
@@ -1180,13 +1180,15 @@ impl<A: Actor> Instance<A> {
         // https://docs.rs/tokio/latest/tokio/task/struct.JoinError.html#method.is_panic
         // What we do here is just to catch it early so we can handle it.
 
+        let mut did_panic = false;
         let result = match AssertUnwindSafe(self.run(actor, &mut actor_loop_receivers, work_rx))
             .catch_unwind()
             .await
         {
             Ok(result) => result,
             Err(err) => {
                 // This is only the error message. Backtrace is not included.
+                did_panic = true;
                 let err_msg = err
                     .downcast_ref::<&str>()
                     .copied()
@@ -1252,8 +1254,41 @@ impl<A: Actor> Instance<A> {
                 }
             }
         }
-
-        result
+        // Run the actor cleanup function before the actor stops to delete
+        // resources. If it times out, continue with stopping the actor.
+        // Don't call it if there was a panic, because the actor may
+        // be in an invalid state and unable to access anything, for example
+        // the GIL.
+        let cleanup_result = if !did_panic {
+            let cleanup_timeout = config::global::get(config::CLEANUP_TIMEOUT);
+            match RealClock
+                .timeout(cleanup_timeout, actor.cleanup(self, result.as_ref().err()))
+                .await
+            {
+                Ok(Ok(x)) => Ok(x),
+                Ok(Err(e)) => Err(ActorError::new(self.self_id(), ActorErrorKind::cleanup(e))),
+                Err(e) => Err(ActorError::new(
+                    self.self_id(),
+                    ActorErrorKind::cleanup(e.into()),
+                )),
+            }
+        } else {
+            Ok(())
+        };
+        if let Err(ref actor_err) = result {
+            // The original result error takes precedence over the cleanup error,
+            // so make sure the cleanup error is still logged in that case.
+            if let Err(ref err) = cleanup_result {
+                tracing::warn!(
+                    cleanup_err = %err,
+                    %actor_err,
+                    "ignoring cleanup error after actor error",
+                );
+            }
+        }
+        // If the original exit was not an error, let cleanup errors be
+        // surfaced.
+        result.and(cleanup_result)
     }
 
     /// Initialize and run the actor until it fails or is stopped.
diff --git a/monarch_hyperactor/src/actor.rs b/monarch_hyperactor/src/actor.rs
@@ -22,6 +22,8 @@ use hyperactor::Named;
 use hyperactor::OncePortHandle;
 use hyperactor::PortHandle;
 use hyperactor::ProcId;
+use hyperactor::actor::ActorError;
+use hyperactor::attrs::Attrs;
 use hyperactor::mailbox::MessageEnvelope;
 use hyperactor::mailbox::Undeliverable;
 use hyperactor::message::Bind;
@@ -570,6 +572,63 @@ impl Actor for PythonActor {
         })?)
     }
 
+    /// Runs the Python actor object's `__cleanup__` method, if it has one.
+    ///
+    /// `err` is converted with `to_string` and passed to Python as an optional
+    /// string (`None` when there is no error). If `__cleanup__` returns a
+    /// non-`None` value, that value is converted into a future and awaited
+    /// after the `with_gil` closure has returned.
+    ///
+    /// Returns `Ok(())` when the actor object has no `__cleanup__` attribute
+    /// (or the attribute lookup itself fails). Returns an error if building
+    /// the Python context, calling `__cleanup__`, converting its result into
+    /// a future, or awaiting that future fails.
+    async fn cleanup(
+        &mut self,
+        this: &Instance<Self>,
+        err: Option<&ActorError>,
+    ) -> anyhow::Result<()> {
+        // Calls the "__cleanup__" method on the python instance to allow the actor
+        // to control its own cleanup.
+        // No headers because this isn't in the context of a message.
+        let cx = Context::new(this, Attrs::new());
+        // Turn the ActorError into a representation of the error. We may not
+        // have an original exception object or traceback, so we just pass in
+        // the message.
+        let err_as_str = err.map(|e| e.to_string());
+        let future = Python::with_gil(|py| {
+            // Reuse the cached Python instance object when there is one;
+            // otherwise build a fresh PyInstance from `this`.
+            let py_cx = match self.instance {
+                Some(ref instance) => crate::context::PyContext::new(&cx, instance.clone_ref(py)),
+                None => {
+                    let py_instance: crate::context::PyInstance = this.into();
+                    crate::context::PyContext::new(
+                        &cx,
+                        py_instance
+                            .into_py_any(py)?
+                            .downcast_bound(py)
+                            .map_err(PyErr::from)?
+                            .clone()
+                            .unbind(),
+                    )
+                }
+            }
+            .into_bound_py_any(py)?;
+            let actor = self.actor.bind(py);
+            // Some tests don't use the Actor base class, so add this check
+            // to be defensive. A failed attribute lookup is treated the same
+            // as a missing `__cleanup__`.
+            match actor.hasattr("__cleanup__") {
+                Ok(false) | Err(_) => {
+                    // No cleanup found, default to returning None
+                    return Ok(None);
+                }
+                _ => {}
+            }
+            // An exception raised by the call itself is wrapped in a
+            // SerializablePyErr before being returned.
+            let awaitable = actor
+                .call_method("__cleanup__", (&py_cx, err_as_str), None)
+                .map_err(|err| anyhow::Error::from(SerializablePyErr::from(py, &err)))?;
+            if awaitable.is_none() {
+                // `__cleanup__` returned None: there is nothing to await.
+                Ok(None)
+            } else {
+                // Convert the returned Python awaitable into a Rust future
+                // using this actor's task locals.
+                pyo3_async_runtimes::into_future_with_locals(self.get_task_locals(py), awaitable)
+                    .map(Some)
+                    .map_err(anyhow::Error::from)
+            }
+        })?;
+        // Await the Python awaitable (if any) after the `with_gil` closure
+        // has returned.
+        if let Some(future) = future {
+            future.await.map_err(anyhow::Error::from)?;
+        }
+        Ok(())
+    }
+
     async fn handle_undeliverable_message(
         &mut self,
         ins: &Instance<Self>,
diff --git a/python/monarch/_src/actor/actor_mesh.py b/python/monarch/_src/actor/actor_mesh.py
@@ -1145,6 +1145,34 @@ def __supervise__(self, cx: Context, *args: Any, **kwargs: Any) -> object:
             # propagated to the next owner.
             return None
 
+    async def __cleanup__(self, cx: Context, exc: str | Exception | None) -> None:
+        """Cleans up any resources owned by this Actor before stopping. Automatically
+        called even if there is an error"""
+        _context.set(cx)
+        instance = self.instance
+        if instance is None:
+            # If there is no instance, there's nothing to clean up, the actor
+            # was never constructed
+            return None
+
+        # Forward a call to supervise on this actor to the user-provided instance.
+        cleanup = getattr(instance, "__cleanup__", None)
+        if cleanup is None:
+            return None
+
+        if isinstance(exc, str):
+            # Wrap the string in an exception object so the main API of __cleanup__
+            # is to take an optional exception object.
+            # The raw string is used for wider compatibility with other error
+            # types for now.
+            exc = Exception(exc)
+
+        if inspect.iscoroutinefunction(cleanup):
+            return await cleanup(exc)
+        else:
+            with fake_sync_state():
+                return cleanup(exc)
+
     def __repr__(self) -> str:
         return f"_Actor(instance={self.instance!r})"
 
@@ -1232,6 +1260,7 @@ def __init__(
 
         async_endpoints = []
         sync_endpoints = []
+        async_cleanup = None
         for attr_name in dir(self._class):
             attr_value = getattr(self._class, attr_name, None)
             if isinstance(attr_value, EndpointProperty):
@@ -1255,13 +1284,28 @@ def __init__(
                     async_endpoints.append(attr_name)
                 else:
                     sync_endpoints.append(attr_name)
+            if attr_name == "__cleanup__" and attr_value is not None:
+                async_cleanup = inspect.iscoroutinefunction(attr_value)
 
         if sync_endpoints and async_endpoints:
             raise ValueError(
                 f"{self._class} mixes both async and sync endpoints."
                 "Synchronous endpoints cannot be mixed with async endpoints because they can cause the asyncio loop to deadlock if they wait."
                 f"sync: {sync_endpoints} async: {async_endpoints}"
             )
+        if sync_endpoints and async_cleanup:
+            raise ValueError(
+                f"{self._class} has sync endpoints, but an async __cleanup__. Make sure __cleanup__ is also synchronous."
+                "Synchronous endpoints cannot be mixed with async endpoints because they can cause the asyncio loop to deadlock if they wait."
+                f"sync: {sync_endpoints}"
+            )
+        # Check for False explicitly because None means there is no cleanup.
+        if async_endpoints and async_cleanup is False:
+            raise ValueError(
+                f"{self._class} has async endpoints, but a synchronous __cleanup__. Make sure __cleanup__ is also async."
+                "Synchronous endpoints cannot be mixed with async endpoints because they can cause the asyncio loop to deadlock if they wait."
+                f"sync: {sync_endpoints}"
+            )
 
     def __getattr__(self, attr: str) -> NotAnEndpoint:
         if attr in dir(self._class):
diff --git a/python/tests/test_cuda.py b/python/tests/test_cuda.py
@@ -9,13 +9,14 @@
 import os
 import sys
 import unittest
-from typing import Dict, List
+from typing import cast, Dict, List
 
 import cloudpickle
-import monarch.actor
 import torch
+import torch.distributed as dist
+from monarch._src.actor.actor_mesh import ActorMesh
 from monarch._src.actor.host_mesh import create_local_host_mesh, fake_in_process_host
-from monarch.actor import Actor, endpoint
+from monarch.actor import Actor, current_rank, current_size, endpoint, this_host
 
 
 class CudaInitTestActor(Actor):
@@ -46,6 +47,42 @@ async def is_cuda_initialized(self) -> bool:
         return self.cuda_initialized
 
 
+class TorchDistributedActor(Actor):
+    """Actor that initializes CUDA and checks environment variables"""
+
+    def __init__(self) -> None:
+        self.rank = int(current_rank()["gpus"])
+        self.world_size = int(current_size()["gpus"])
+        self.port = 29500
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(self.port)
+
+    @endpoint
+    def init_torch_distributed(self) -> None:
+        if not dist.is_initialized():
+            dist.init_process_group(
+                backend="nccl",
+                world_size=self.world_size,
+                rank=self.rank,
+            )
+
+    @endpoint
+    def is_initialized(self) -> bool:
+        return dist.is_initialized()
+
+    # Cleanup is a special function called automatically on actor stop.
+    def __cleanup__(self, exc: Exception | None) -> None:
+        self.logger.info(f"Cleanup called with exception: {exc}")
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+class IsTorchInitializedActor(Actor):
+    """Actor whose only endpoint reports ``dist.is_initialized()``."""
+
+    @endpoint
+    def is_initialized(self) -> bool:
+        # Reports the torch.distributed state as seen by this actor.
+        return dist.is_initialized()
+
+
 class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
     """Test that the env vars are setup before cuda init"""
 
@@ -149,3 +186,16 @@ async def test_proc_mesh_with_dictionary_env(self) -> None:
                 env_vars.get("CUDA_DEVICE_MAX_CONNECTIONS"),
                 "1",
             )
+
+    async def test_cleanup_torch_distributed(self) -> None:
+        """Test that calling stop on the actor destroys the process group"""
+        proc_mesh = this_host().spawn_procs(per_host={"gpus": 1})
+
+        actor = proc_mesh.spawn("torch_init", TorchDistributedActor)
+        tester = proc_mesh.spawn("check", IsTorchInitializedActor)
+        await actor.init_torch_distributed.call_one()
+        self.assertTrue(await actor.is_initialized.call_one())
+        # Stop the actor and ensure cleanup is called, by using another actor
+        # on the same proc.
+        await cast(ActorMesh[TorchDistributedActor], actor).stop()
+        self.assertFalse(await tester.is_initialized.call_one())
diff --git a/python/tests/test_python_actors.py b/python/tests/test_python_actors.py
@@ -1816,3 +1816,50 @@ def test_context_propagated_through_python_task_spawn_blocking():
     p = this_host().spawn_procs()
     a = p.spawn("test_pytokio_actor", TestPytokioActor)
     a.context_propagated_through_spawn_blocking.call().get()
+
+
+class ActorWithCleanup(Actor):
+    def __init__(self, counter: Counter) -> None:
+        self.counter = counter
+
+    @endpoint
+    def check(self) -> None:
+        pass
+
+    def __cleanup__(self, exc: Exception | None):
+        self.logger.info(f"Calling __cleanup__ on {self}, {exc=}")
+        self.counter.incr.call_one().get()
+
+
+class ActorWithAsyncCleanup(Actor):
+    def __init__(self, counter: Counter) -> None:
+        self.counter = counter
+
+    @endpoint
+    async def check(self) -> None:
+        pass
+
+    # Cleanup should match the async-ness of the other endpoints.
+    async def __cleanup__(self, exc: Exception | None):
+        self.logger.info(f"Calling __cleanup__ on {self}, {exc=}")
+        await self.counter.incr.call_one()
+
+
+def test_cleanup():
+    procs = this_host().spawn_procs(per_host={"gpus": 1})
+    counter = procs.spawn("counter", Counter, 0)
+    cleanup = procs.spawn("cleanup", ActorWithCleanup, counter)
+    # Call an endpoint to ensure it is constructed.
+    cleanup.check.call_one().get()
+    cast(ActorMesh[ActorWithCleanup], cleanup).stop().get()
+    assert counter.value.call_one().get() == 1
+
+
+def test_cleanup_async():
+    procs = this_host().spawn_procs(per_host={"gpus": 1})
+    counter = procs.spawn("counter", Counter, 0)
+    cleanup = procs.spawn("cleanup", ActorWithCleanup, counter)
+    # Call an endpoint to ensure it is constructed.
+    cleanup.check.call_one().get()
+    cast(ActorMesh[ActorWithCleanup], cleanup).stop().get()
+    assert counter.value.call_one().get() == 1