Skip to content

Commit 9daae06

Browse files
nixprimegvisor-bot
authored andcommitted
sentry/kernel: unshare atomically and without holding Task.mu unnecessarily
Both changes are consistent with Linux's kernel/fork.c:ksys_unshare(). PiperOrigin-RevId: 821961783
1 parent 61f4c77 commit 9daae06

File tree

3 files changed

+117
-135
lines changed

3 files changed

+117
-135
lines changed

pkg/sentry/kernel/auth/credentials.go

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,28 @@ func (c *Credentials) Fork() *Credentials {
152152
return nc
153153
}
154154

155+
// ForkIntoUserNamespace returns a copy of c after its caller has entered user
156+
// namespace ns.
157+
func (c *Credentials) ForkIntoUserNamespace(ns *UserNamespace) *Credentials {
158+
nc := c.Fork()
159+
nc.UserNamespace = ns
160+
// "The child process created by clone(2) with the CLONE_NEWUSER flag
161+
// starts out with a complete set of capabilities in the new user
162+
// namespace. Likewise, a process that creates a new user namespace using
163+
// unshare(2) or joins an existing user namespace using setns(2) gains a
164+
// full set of capabilities in that namespace."
165+
nc.PermittedCaps = AllCapabilities
166+
nc.InheritableCaps = 0
167+
nc.EffectiveCaps = AllCapabilities
168+
nc.BoundingCaps = AllCapabilities
169+
// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
170+
// flag sets the "securebits" flags (see capabilities(7)) to their default
171+
// values (all flags disabled) in the child (for clone(2)) or caller (for
172+
// unshare(2), or setns(2)." - user_namespaces(7)
173+
nc.KeepCaps = false
174+
return nc
175+
}
176+
155177
// InGroup returns true if c is in group kgid. Compare Linux's
156178
// kernel/groups.c:in_group_p().
157179
func (c *Credentials) InGroup(kgid KGID) bool {
@@ -242,34 +264,6 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
242264
return NoID, linuxerr.EPERM
243265
}
244266

245-
// SetUID translates the provided uid to the root user namespace and updates c's
246-
// uids to it. This performs no permissions or capabilities checks, the caller
247-
// is responsible for ensuring the calling context is permitted to modify c.
248-
func (c *Credentials) SetUID(uid UID) error {
249-
kuid := c.UserNamespace.MapToKUID(uid)
250-
if !kuid.Ok() {
251-
return linuxerr.EINVAL
252-
}
253-
c.RealKUID = kuid
254-
c.EffectiveKUID = kuid
255-
c.SavedKUID = kuid
256-
return nil
257-
}
258-
259-
// SetGID translates the provided gid to the root user namespace and updates c's
260-
// gids to it. This performs no permissions or capabilities checks, the caller
261-
// is responsible for ensuring the calling context is permitted to modify c.
262-
func (c *Credentials) SetGID(gid GID) error {
263-
kgid := c.UserNamespace.MapToKGID(gid)
264-
if !kgid.Ok() {
265-
return linuxerr.EINVAL
266-
}
267-
c.RealKGID = kgid
268-
c.EffectiveKGID = kgid
269-
c.SavedKGID = kgid
270-
return nil
271-
}
272-
273267
// LoadSeccheckData sets credential data based on mask.
274268
func (c *Credentials) LoadSeccheckData(mask seccheck.FieldMask, info *pb.ContextData) {
275269
if mask.Contains(seccheck.FieldCtxtCredentials) {

pkg/sentry/kernel/task_clone.go

Lines changed: 95 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -298,11 +298,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
298298
}
299299

300300
if userns != creds.UserNamespace {
301-
if err := nt.SetUserNamespace(userns); err != nil {
302-
// This shouldn't be possible: userns was created from nt.creds, so
303-
// nt should have CAP_SYS_ADMIN in userns.
304-
panic("Task.Clone: SetUserNamespace failed: " + err.Error())
305-
}
301+
nt.creds.Store(creds.ForkIntoUserNamespace(userns))
306302
}
307303

308304
// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
@@ -618,7 +614,6 @@ func (t *Task) Unshare(flags int32) error {
618614
if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
619615
return linuxerr.EINVAL
620616
}
621-
creds := t.Credentials()
622617
if flags&linux.CLONE_THREAD != 0 {
623618
t.tg.signalHandlers.mu.Lock()
624619
if t.tg.tasksCount != 1 {
@@ -629,98 +624,126 @@ func (t *Task) Unshare(flags int32) error {
629624
// This isn't racy because we're the only living task, and therefore
630625
// the only task capable of creating new ones, in our thread group.
631626
}
627+
628+
// Prepare new execution context.
629+
creds := t.Credentials()
630+
var (
631+
newFSContext *FSContext
632+
newFDTable *FDTable
633+
newCreds bool
634+
newChildPIDNS *PIDNamespace
635+
newNetNS *inet.Namespace
636+
newUTSNS *UTSNamespace
637+
newIPCNS *IPCNamespace
638+
newMountNS *vfs.MountNamespace
639+
)
640+
defer func() {
641+
if newFSContext != nil {
642+
newFSContext.destroy(t)
643+
}
644+
if newFDTable != nil {
645+
newFDTable.DecRef(t)
646+
}
647+
if newNetNS != nil {
648+
newNetNS.DecRef(t)
649+
}
650+
if newUTSNS != nil {
651+
newUTSNS.DecRef(t)
652+
}
653+
if newIPCNS != nil {
654+
newIPCNS.DecRef(t)
655+
}
656+
if newMountNS != nil {
657+
newMountNS.DecRef(t)
658+
}
659+
}()
660+
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
661+
newFSContext = t.FSContext().Fork()
662+
}
663+
if flags&linux.CLONE_FILES != 0 {
664+
newFDTable = t.fdTable.Fork(t, MaxFdLimit)
665+
}
632666
if flags&linux.CLONE_NEWUSER != 0 {
633667
if t.IsChrooted() {
634668
return linuxerr.EPERM
635669
}
670+
var err error
636671
newUserNS, err := creds.NewChildUserNamespace()
637672
if err != nil {
638673
return err
639674
}
640-
err = t.SetUserNamespace(newUserNS)
641-
if err != nil {
642-
return err
643-
}
644-
// Need to reload creds, because t.SetUserNamespace() changed task credentials.
645-
creds = t.Credentials()
675+
creds = t.Credentials().ForkIntoUserNamespace(newUserNS)
676+
newCreds = true
646677
}
647-
haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
648-
if flags&linux.CLONE_NEWPID != 0 {
649-
if !haveCapSysAdmin {
678+
if flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC|linux.CLONE_NEWNS) != 0 {
679+
if !creds.HasCapability(linux.CAP_SYS_ADMIN) {
650680
return linuxerr.EPERM
651681
}
652-
t.childPIDNamespace = t.tg.pidns.NewChild(t, t.k, t.UserNamespace())
682+
}
683+
if flags&linux.CLONE_NEWPID != 0 {
684+
newChildPIDNS = t.tg.pidns.NewChild(t, t.k, creds.UserNamespace)
653685
}
654686
if flags&linux.CLONE_NEWNET != 0 {
655-
if !haveCapSysAdmin {
656-
return linuxerr.EPERM
657-
}
658-
netns := t.NetworkNamespace()
659-
netns = inet.NewNamespace(netns, t.UserNamespace())
660-
netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
661-
netns.SetInode(netnsInode)
662-
t.mu.Lock()
663-
oldNetns := t.netns
664-
t.netns = netns
665-
t.mu.Unlock()
666-
oldNetns.DecRef(t)
687+
newNetNS = inet.NewNamespace(t.netns, creds.UserNamespace)
688+
newNetNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newNetNS))
667689
}
668-
669-
cu := cleanup.Cleanup{}
670-
// All cu actions has to be executed after releasing t.mu.
671-
defer cu.Clean()
672-
t.mu.Lock()
673-
defer t.mu.Unlock()
674690
if flags&linux.CLONE_NEWUTS != 0 {
675-
if !haveCapSysAdmin {
676-
return linuxerr.EPERM
677-
}
678-
// Note that this must happen after NewUserNamespace, so the
679-
// new user namespace is used if there is one.
680-
oldUTSNS := t.utsns
681-
t.utsns = t.utsns.Clone(creds.UserNamespace)
682-
t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
683-
cu.Add(func() { oldUTSNS.DecRef(t) })
691+
newUTSNS = t.utsns.Clone(creds.UserNamespace)
692+
newUTSNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUTSNS))
684693
}
685694
if flags&linux.CLONE_NEWIPC != 0 {
686-
if !haveCapSysAdmin {
687-
return linuxerr.EPERM
695+
newIPCNS = NewIPCNamespace(creds.UserNamespace)
696+
newIPCNS.InitPosixQueues(t, t.k.VFS(), creds)
697+
newIPCNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newIPCNS))
698+
}
699+
if flags&linux.CLONE_NEWNS != 0 {
700+
fsContext := newFSContext
701+
if fsContext == nil {
702+
fsContext = t.FSContext()
703+
}
704+
var err error
705+
newMountNS, err = t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, t.mountNamespace, &fsContext.root, &fsContext.cwd, t.k)
706+
if err != nil {
707+
return err
688708
}
689-
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
690-
// namespace"
691-
oldIPCNS := t.ipcns
692-
t.ipcns = NewIPCNamespace(creds.UserNamespace)
693-
t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
694-
t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
695-
cu.Add(func() { oldIPCNS.DecRef(t) })
696709
}
697-
if flags&linux.CLONE_FILES != 0 {
698-
oldFDTable := t.fdTable
699-
t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
700-
cu.Add(func() { oldFDTable.DecRef(t) })
710+
711+
// Switch to new execution context. Store replaced resources in new* so
712+
// that they're cleaned up by the deferred function.
713+
if newCreds {
714+
t.creds.Store(creds)
701715
}
702-
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
716+
t.mu.Lock()
717+
defer t.mu.Unlock()
718+
if newFSContext != nil {
703719
oldFSContext := t.FSContext()
704720
// unshareFromTask() lowers the old fs context's ref count, but its for us to
705721
// destroy it if there are no other references.
706-
if oldFSContext.unshareFromTask(t, oldFSContext.Fork()) {
707-
// destroy() requires t.mu to not be held, hence the deferral.
708-
cu.Add(func() { oldFSContext.destroy(t) })
722+
if oldFSContext.unshareFromTask(t, newFSContext) {
723+
newFSContext = oldFSContext
724+
} else {
725+
newFSContext = nil
709726
}
710727
}
711-
if flags&linux.CLONE_NEWNS != 0 {
712-
if !haveCapSysAdmin {
713-
return linuxerr.EPERM
714-
}
715-
oldMountNS := t.mountNamespace
716-
fsContext := t.FSContext()
717-
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &fsContext.root, &fsContext.cwd, t.k)
718-
if err != nil {
719-
return err
720-
}
721-
t.mountNamespace = mntns
722-
cu.Add(func() { oldMountNS.DecRef(t) })
728+
if newFDTable != nil {
729+
t.fdTable, newFDTable = newFDTable, t.fdTable
723730
}
731+
if newChildPIDNS != nil {
732+
t.childPIDNamespace = newChildPIDNS
733+
}
734+
if newNetNS != nil {
735+
t.netns, newNetNS = newNetNS, t.netns
736+
}
737+
if newUTSNS != nil {
738+
t.utsns, newUTSNS = newUTSNS, t.utsns
739+
}
740+
if newIPCNS != nil {
741+
t.ipcns, newIPCNS = newIPCNS, t.ipcns
742+
}
743+
if newMountNS != nil {
744+
t.mountNamespace, newMountNS = newMountNS, t.mountNamespace
745+
}
746+
724747
return nil
725748
}
726749

pkg/sentry/kernel/task_identity.go

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -419,41 +419,6 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error {
419419
return nil
420420
}
421421

422-
// SetUserNamespace attempts to move c into ns.
423-
//
424-
// Preconditions: The caller must be running on the task goroutine.
425-
func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
426-
creds := t.Credentials()
427-
// "A process reassociating itself with a user namespace must have the
428-
// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
429-
//
430-
// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
431-
// in ns (by rule 3 in auth.Credentials.HasCapability).
432-
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
433-
return linuxerr.EPERM
434-
}
435-
436-
creds = creds.Fork() // The credentials object is immutable. See doc for creds.
437-
creds.UserNamespace = ns
438-
// "The child process created by clone(2) with the CLONE_NEWUSER flag
439-
// starts out with a complete set of capabilities in the new user
440-
// namespace. Likewise, a process that creates a new user namespace using
441-
// unshare(2) or joins an existing user namespace using setns(2) gains a
442-
// full set of capabilities in that namespace."
443-
creds.PermittedCaps = auth.AllCapabilities
444-
creds.InheritableCaps = 0
445-
creds.EffectiveCaps = auth.AllCapabilities
446-
creds.BoundingCaps = auth.AllCapabilities
447-
// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
448-
// flag sets the "securebits" flags (see capabilities(7)) to their default
449-
// values (all flags disabled) in the child (for clone(2)) or caller (for
450-
// unshare(2), or setns(2)." - user_namespaces(7)
451-
creds.KeepCaps = false
452-
t.creds.Store(creds)
453-
454-
return nil
455-
}
456-
457422
// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS.
458423
//
459424
// Preconditions: The caller must be running on the task goroutine.

0 commit comments

Comments
 (0)