@@ -298,11 +298,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
298298 }
299299
300300 if userns != creds .UserNamespace {
301- if err := nt .SetUserNamespace (userns ); err != nil {
302- // This shouldn't be possible: userns was created from nt.creds, so
303- // nt should have CAP_SYS_ADMIN in userns.
304- panic ("Task.Clone: SetUserNamespace failed: " + err .Error ())
305- }
301+ nt .creds .Store (creds .ForkIntoUserNamespace (userns ))
306302 }
307303
308304 // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
@@ -618,7 +614,6 @@ func (t *Task) Unshare(flags int32) error {
618614 if flags & (linux .CLONE_VM | linux .CLONE_SIGHAND ) != 0 {
619615 return linuxerr .EINVAL
620616 }
621- creds := t .Credentials ()
622617 if flags & linux .CLONE_THREAD != 0 {
623618 t .tg .signalHandlers .mu .Lock ()
624619 if t .tg .tasksCount != 1 {
@@ -629,98 +624,126 @@ func (t *Task) Unshare(flags int32) error {
629624 // This isn't racy because we're the only living task, and therefore
630625 // the only task capable of creating new ones, in our thread group.
631626 }
627+
628+ // Prepare new execution context.
629+ creds := t .Credentials ()
630+ var (
631+ newFSContext * FSContext
632+ newFDTable * FDTable
633+ newCreds bool
634+ newChildPIDNS * PIDNamespace
635+ newNetNS * inet.Namespace
636+ newUTSNS * UTSNamespace
637+ newIPCNS * IPCNamespace
638+ newMountNS * vfs.MountNamespace
639+ )
640+ defer func () {
641+ if newFSContext != nil {
642+ newFSContext .destroy (t )
643+ }
644+ if newFDTable != nil {
645+ newFDTable .DecRef (t )
646+ }
647+ if newNetNS != nil {
648+ newNetNS .DecRef (t )
649+ }
650+ if newUTSNS != nil {
651+ newUTSNS .DecRef (t )
652+ }
653+ if newIPCNS != nil {
654+ newIPCNS .DecRef (t )
655+ }
656+ if newMountNS != nil {
657+ newMountNS .DecRef (t )
658+ }
659+ }()
660+ if flags & linux .CLONE_FS != 0 || flags & linux .CLONE_NEWNS != 0 {
661+ newFSContext = t .FSContext ().Fork ()
662+ }
663+ if flags & linux .CLONE_FILES != 0 {
664+ newFDTable = t .fdTable .Fork (t , MaxFdLimit )
665+ }
632666 if flags & linux .CLONE_NEWUSER != 0 {
633667 if t .IsChrooted () {
634668 return linuxerr .EPERM
635669 }
670+ var err error
636671 newUserNS , err := creds .NewChildUserNamespace ()
637672 if err != nil {
638673 return err
639674 }
640- err = t .SetUserNamespace (newUserNS )
641- if err != nil {
642- return err
643- }
644- // Need to reload creds, because t.SetUserNamespace() changed task credentials.
645- creds = t .Credentials ()
675+ creds = t .Credentials ().ForkIntoUserNamespace (newUserNS )
676+ newCreds = true
646677 }
647- haveCapSysAdmin := t .HasCapability (linux .CAP_SYS_ADMIN )
648- if flags & linux .CLONE_NEWPID != 0 {
649- if ! haveCapSysAdmin {
678+ if flags & (linux .CLONE_NEWPID | linux .CLONE_NEWNET | linux .CLONE_NEWUTS | linux .CLONE_NEWIPC | linux .CLONE_NEWNS ) != 0 {
679+ if ! creds .HasCapability (linux .CAP_SYS_ADMIN ) {
650680 return linuxerr .EPERM
651681 }
652- t .childPIDNamespace = t .tg .pidns .NewChild (t , t .k , t .UserNamespace ())
682+ }
683+ if flags & linux .CLONE_NEWPID != 0 {
684+ newChildPIDNS = t .tg .pidns .NewChild (t , t .k , creds .UserNamespace )
653685 }
654686 if flags & linux .CLONE_NEWNET != 0 {
655- if ! haveCapSysAdmin {
656- return linuxerr .EPERM
657- }
658- netns := t .NetworkNamespace ()
659- netns = inet .NewNamespace (netns , t .UserNamespace ())
660- netnsInode := nsfs .NewInode (t , t .k .nsfsMount , netns )
661- netns .SetInode (netnsInode )
662- t .mu .Lock ()
663- oldNetns := t .netns
664- t .netns = netns
665- t .mu .Unlock ()
666- oldNetns .DecRef (t )
687+ newNetNS = inet .NewNamespace (t .netns , creds .UserNamespace )
688+ newNetNS .SetInode (nsfs .NewInode (t , t .k .nsfsMount , newNetNS ))
667689 }
668-
669- cu := cleanup.Cleanup {}
670- // All cu actions has to be executed after releasing t.mu.
671- defer cu .Clean ()
672- t .mu .Lock ()
673- defer t .mu .Unlock ()
674690 if flags & linux .CLONE_NEWUTS != 0 {
675- if ! haveCapSysAdmin {
676- return linuxerr .EPERM
677- }
678- // Note that this must happen after NewUserNamespace, so the
679- // new user namespace is used if there is one.
680- oldUTSNS := t .utsns
681- t .utsns = t .utsns .Clone (creds .UserNamespace )
682- t .utsns .SetInode (nsfs .NewInode (t , t .k .nsfsMount , t .utsns ))
683- cu .Add (func () { oldUTSNS .DecRef (t ) })
691+ newUTSNS = t .utsns .Clone (creds .UserNamespace )
692+ newUTSNS .SetInode (nsfs .NewInode (t , t .k .nsfsMount , newUTSNS ))
684693 }
685694 if flags & linux .CLONE_NEWIPC != 0 {
686- if ! haveCapSysAdmin {
687- return linuxerr .EPERM
695+ newIPCNS = NewIPCNamespace (creds .UserNamespace )
696+ newIPCNS .InitPosixQueues (t , t .k .VFS (), creds )
697+ newIPCNS .SetInode (nsfs .NewInode (t , t .k .nsfsMount , newIPCNS ))
698+ }
699+ if flags & linux .CLONE_NEWNS != 0 {
700+ fsContext := newFSContext
701+ if fsContext == nil {
702+ fsContext = t .FSContext ()
703+ }
704+ var err error
705+ newMountNS , err = t .k .vfs .CloneMountNamespace (t , creds .UserNamespace , t .mountNamespace , & fsContext .root , & fsContext .cwd , t .k )
706+ if err != nil {
707+ return err
688708 }
689- // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
690- // namespace"
691- oldIPCNS := t .ipcns
692- t .ipcns = NewIPCNamespace (creds .UserNamespace )
693- t .ipcns .InitPosixQueues (t , t .k .VFS (), creds )
694- t .ipcns .SetInode (nsfs .NewInode (t , t .k .nsfsMount , t .ipcns ))
695- cu .Add (func () { oldIPCNS .DecRef (t ) })
696709 }
697- if flags & linux .CLONE_FILES != 0 {
698- oldFDTable := t .fdTable
699- t .fdTable = oldFDTable .Fork (t , MaxFdLimit )
700- cu .Add (func () { oldFDTable .DecRef (t ) })
710+
711+ // Switch to new execution context. Store replaced resources in new* so
712+ // that they're cleaned up by the deferred function.
713+ if newCreds {
714+ t .creds .Store (creds )
701715 }
702- if flags & linux .CLONE_FS != 0 || flags & linux .CLONE_NEWNS != 0 {
716+ t .mu .Lock ()
717+ defer t .mu .Unlock ()
718+ if newFSContext != nil {
703719 oldFSContext := t .FSContext ()
704720 // unshareFromTask() lowers the old fs context's ref count, but its for us to
705721 // destroy it if there are no other references.
706- if oldFSContext .unshareFromTask (t , oldFSContext .Fork ()) {
707- // destroy() requires t.mu to not be held, hence the deferral.
708- cu .Add (func () { oldFSContext .destroy (t ) })
722+ if oldFSContext .unshareFromTask (t , newFSContext ) {
723+ newFSContext = oldFSContext
724+ } else {
725+ newFSContext = nil
709726 }
710727 }
711- if flags & linux .CLONE_NEWNS != 0 {
712- if ! haveCapSysAdmin {
713- return linuxerr .EPERM
714- }
715- oldMountNS := t .mountNamespace
716- fsContext := t .FSContext ()
717- mntns , err := t .k .vfs .CloneMountNamespace (t , creds .UserNamespace , oldMountNS , & fsContext .root , & fsContext .cwd , t .k )
718- if err != nil {
719- return err
720- }
721- t .mountNamespace = mntns
722- cu .Add (func () { oldMountNS .DecRef (t ) })
728+ if newFDTable != nil {
729+ t .fdTable , newFDTable = newFDTable , t .fdTable
723730 }
731+ if newChildPIDNS != nil {
732+ t .childPIDNamespace = newChildPIDNS
733+ }
734+ if newNetNS != nil {
735+ t .netns , newNetNS = newNetNS , t .netns
736+ }
737+ if newUTSNS != nil {
738+ t .utsns , newUTSNS = newUTSNS , t .utsns
739+ }
740+ if newIPCNS != nil {
741+ t .ipcns , newIPCNS = newIPCNS , t .ipcns
742+ }
743+ if newMountNS != nil {
744+ t .mountNamespace , newMountNS = newMountNS , t .mountNamespace
745+ }
746+
724747 return nil
725748}
726749
0 commit comments