Skip to content

Commit d690038

Browse files
ayushr2gvisor-bot
authored andcommitted
Reap the gofer process in Container.Wait().
The linked issue is regarding `runsc run --detach=false` not waiting on the gofer (which is its child process). Because of this, if the container is killed externally via `runsc delete`, then the gofer process becomes a zombie. There is a deadlock because `runsc delete` is waiting for the gofer process to be reaped but the `runsc run` process is stuck waiting for a filesystem lock held by the `runsc delete` process before it can reap it in Container.Destroy(). The fix is to also attempt reaping the child gofer process in Container.Wait(). Notice that this is similar to what Sandbox does. Both Sandbox.Wait() and Sandbox.destroy() call Sandbox.waitForStopped() which reaps the child sandbox process. However, only Container.Destroy() => stop() => waitForStopped(). This change updates Container.Wait() to start a goroutine that waits on the gofer PID in the background when the gofer process is a child of the caller. This fixes the above deadlock because `runsc run` will reap the gofer before getting stuck waiting for the filesystem lock. Fixes #12051 PiperOrigin-RevId: 805669277
1 parent 2188c04 commit d690038

File tree

1 file changed

+31
-5
lines changed

1 file changed

+31
-5
lines changed

runsc/container/container.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,30 @@ func (c *Container) SandboxPid() int {
604604
// and wait returns immediately.
605605
func (c *Container) Wait() (unix.WaitStatus, error) {
606606
log.Debugf("Wait on container, cid: %s", c.ID)
607+
if c.goferIsChild {
608+
// The gofer process is a child of the current process, so we need to wait
609+
// on it and collect its zombie. Start a goroutine to reap c.GoferPid. Do
610+
// not block the main thread waiting for the gofer to exit. In certain
611+
// multi-container scenarios, the gofer may outlive its container. For
612+
// example, if a gofer-backed FD is donated by the application to another
613+
// container within the sandbox. The lifecycle of the gofer will be tied to
614+
// both containers, whichever lives longer.
615+
go func() {
616+
goferPid := c.GoferPid.Load()
617+
if goferPid == 0 {
618+
return
619+
}
620+
// The gofer process could be racily reaped by c.waitForStopped() so
621+
// ignore ECHILD errors.
622+
var status unix.WaitStatus
623+
if _, err := unix.Wait4(goferPid, &status, 0, nil); err == nil {
624+
log.Infof("Gofer process (PID=%d) reaped: exit code = %v", goferPid, status.ExitStatus())
625+
} else if !errors.Is(err, unix.ECHILD) {
626+
log.Warningf("error reaping the gofer process (PID=%d) from background goroutine: %v", goferPid, err)
627+
}
628+
c.GoferPid.Store(0)
629+
}()
630+
}
607631
ws, err := c.Sandbox.Wait(c.ID)
608632
if err == nil {
609633
// Wait succeeded, container is not running anymore.
@@ -1131,8 +1155,9 @@ func (c *Container) stop() error {
11311155
// Try killing gofer if it does not exit with container.
11321156
if goferPid := c.GoferPid.Load(); goferPid != 0 {
11331157
log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, goferPid)
1134-
if err := unix.Kill(goferPid, unix.SIGKILL); err != nil {
1135-
// The gofer may already be stopped, log the error.
1158+
// The gofer process may have been racily reaped by the goroutine from
1159+
// c.Wait() so ignore ESRCH errors.
1160+
if err := unix.Kill(goferPid, unix.SIGKILL); err != nil && !errors.Is(err, unix.ESRCH) {
11361161
log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, goferPid, err)
11371162
}
11381163
}
@@ -1170,9 +1195,10 @@ func (c *Container) waitForStopped() error {
11701195
}
11711196

11721197
if c.goferIsChild {
1173-
// The gofer process is a child of the current process,
1174-
// so we can wait it and collect its zombie.
1175-
if _, err := unix.Wait4(int(goferPid), nil, 0, nil); err != nil {
1198+
// The gofer process is a child of the current process, so we can wait it
1199+
// and collect its zombie. The gofer process could be racily reaped by the
1200+
// goroutine from c.Wait() so ignore ECHILD errors.
1201+
if _, err := unix.Wait4(int(goferPid), nil, 0, nil); err != nil && !errors.Is(err, unix.ECHILD) {
11761202
return fmt.Errorf("error waiting the gofer process: %v", err)
11771203
}
11781204
c.GoferPid.Store(0)

0 commit comments

Comments
 (0)