Skip to content

Commit 1b63d8b

Browse files
avagin authored and gvisor-bot committed
Implement setns for pid namespaces
PiperOrigin-RevId: 776238354
1 parent 6d7f074 commit 1b63d8b

File tree

12 files changed

+176
-27
lines changed

12 files changed

+176
-27
lines changed

pkg/sentry/control/lifecycle.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,6 @@ func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
237237
ls.SetUnchecked(lt, limit)
238238
}
239239

240-
// Create a new pid namespace for the container. Each container must run
241-
// in its own pid namespace.
242-
pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace())
243-
244240
initArgs := kernel.CreateProcessArgs{
245241
Filename: args.Filename,
246242
Argv: args.Argv,
@@ -254,11 +250,14 @@ func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
254250
UTSNamespace: l.Kernel.RootUTSNamespace(),
255251
IPCNamespace: l.Kernel.RootIPCNamespace(),
256252
ContainerID: args.ContainerID,
257-
PIDNamespace: pidNs,
258253
}
259254

260255
ctx := initArgs.NewContext(l.Kernel)
261256

257+
// Create a new pid namespace for the container. Each container must run
258+
// in its own pid namespace.
259+
initArgs.PIDNamespace = l.Kernel.RootPIDNamespace().NewChild(ctx, l.Kernel, l.Kernel.RootUserNamespace())
260+
262261
// Import file descriptors.
263262
fdTable := l.Kernel.NewFDTable()
264263
defer fdTable.DecRef(ctx)

pkg/sentry/fsimpl/proc/task.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
7878
"ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{
7979
"net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
8080
"mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
81-
"pid": fs.newPIDNamespaceSymlink(ctx, task, fs.NextIno()),
81+
"pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID),
8282
"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
8383
"ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
8484
"uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),

pkg/sentry/fsimpl/proc/task_files.go

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,17 +1289,6 @@ func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task
12891289
return taskInode
12901290
}
12911291

1292-
func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
1293-
target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID())
1294-
1295-
inode := &namespaceSymlink{task: task}
1296-
// Note: credentials are overridden by taskOwnedInode.
1297-
inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
1298-
1299-
taskInode := &taskOwnedInode{Inode: inode, owner: task}
1300-
return taskInode
1301-
}
1302-
13031292
func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode {
13041293
// Namespace symlinks should contain the namespace name and the inode number
13051294
// for the namespace instance, so for example user:[123456]. We currently fake
@@ -1339,6 +1328,11 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
13391328
}
13401329
inode, _ := mntns.Refs.(*nsfs.Inode)
13411330
return inode
1331+
case linux.CLONE_NEWPID:
1332+
if pidns := t.GetPIDNamespace(); pidns != nil {
1333+
return pidns.GetInode()
1334+
}
1335+
return nil
13421336
default:
13431337
panic("unknown namespace")
13441338
}

pkg/sentry/fsimpl/testutil/kernel.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ func Boot() (*kernel.Kernel, error) {
106106
VdsoParams: params,
107107
RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
108108
RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
109-
PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
109+
RootPIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
110110
UnixSocketOpts: transport.UnixSocketOpts{},
111111
}); err != nil {
112112
return nil, fmt.Errorf("initializing kernel: %v", err)

pkg/sentry/kernel/kernel.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -441,8 +441,8 @@ type InitKernelArgs struct {
441441
// RootIPCNamespace is the root IPC namespace.
442442
RootIPCNamespace *IPCNamespace
443443

444-
// PIDNamespace is the root PID namespace.
445-
PIDNamespace *PIDNamespace
444+
// RootPIDNamespace is the root PID namespace.
445+
RootPIDNamespace *PIDNamespace
446446

447447
// MaxFDLimit specifies the maximum file descriptor number that can be
448448
// used by processes. If it is zero, the limit will be set to
@@ -473,7 +473,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
473473

474474
k.featureSet = args.FeatureSet
475475
k.timekeeper = args.Timekeeper
476-
k.tasks = newTaskSet(args.PIDNamespace)
476+
k.tasks = newTaskSet(args.RootPIDNamespace)
477477
k.rootUserNamespace = args.RootUserNamespace
478478
k.rootUTSNamespace = args.RootUTSNamespace
479479
k.rootIPCNamespace = args.RootIPCNamespace
@@ -540,6 +540,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
540540
k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
541541
k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
542542

543+
args.RootPIDNamespace.InitInode(ctx, k)
544+
543545
tmpfsOpts := vfs.GetFilesystemOptions{
544546
InternalData: tmpfs.FilesystemOpts{
545547
// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
@@ -1911,6 +1913,7 @@ func (k *Kernel) Release() {
19111913
k.rootUTSNamespace.DecRef(ctx)
19121914
k.cleaupDevGofers()
19131915
k.mf.Destroy()
1916+
k.RootPIDNamespace().DecRef(ctx)
19141917
}
19151918

19161919
// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup

pkg/sentry/kernel/task_clone.go

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,8 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
224224
if t.childPIDNamespace != nil {
225225
pidns = t.childPIDNamespace
226226
} else if args.Flags&linux.CLONE_NEWPID != 0 {
227-
pidns = pidns.NewChild(userns)
227+
pidns = pidns.NewChild(t, t.k, userns)
228+
defer pidns.DecRef(t)
228229
}
229230

230231
tg := t.tg
@@ -277,6 +278,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
277278
// the cleanup for us.
278279
cu.Release()
279280
if err != nil {
281+
if args.Flags&linux.CLONE_THREAD == 0 {
282+
tg.Release(t)
283+
}
280284
return 0, nil, err
281285
}
282286

@@ -547,6 +551,36 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
547551
t.mu.Unlock()
548552
oldNS.DecRef(t)
549553
return nil
554+
case *PIDNamespace:
555+
if flags != 0 && flags != linux.CLONE_NEWPID {
556+
return linuxerr.EINVAL
557+
}
558+
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
559+
!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
560+
return linuxerr.EPERM
561+
}
562+
563+
// Allow setting the current pid namespace or any of its descendants.
564+
current := t.PIDNamespace()
565+
ancestor := ns
566+
for ; ancestor != nil; ancestor = ancestor.parent {
567+
if ancestor == current {
568+
break
569+
}
570+
}
571+
if ancestor == nil {
572+
return linuxerr.EINVAL
573+
}
574+
575+
oldNS := t.childPIDNamespace
576+
ns.IncRef()
577+
t.mu.Lock()
578+
t.childPIDNamespace = ns
579+
t.mu.Unlock()
580+
if oldNS != nil {
581+
oldNS.DecRef(t)
582+
}
583+
return nil
550584
default:
551585
return linuxerr.EINVAL
552586
}
@@ -611,7 +645,7 @@ func (t *Task) Unshare(flags int32) error {
611645
if !haveCapSysAdmin {
612646
return linuxerr.EPERM
613647
}
614-
t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
648+
t.childPIDNamespace = t.tg.pidns.NewChild(t, t.k, t.UserNamespace())
615649
}
616650
if flags&linux.CLONE_NEWNET != 0 {
617651
if !haveCapSysAdmin {

pkg/sentry/kernel/task_exit.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,11 +304,16 @@ func (*runExitMain) execute(t *Task) taskRunState {
304304
t.ipcns = nil
305305
netns := t.netns
306306
t.netns = nil
307+
childPIDNS := t.childPIDNamespace
308+
t.childPIDNamespace = nil
307309
t.mu.Unlock()
308310
mntns.DecRef(t)
309311
utsns.DecRef(t)
310312
ipcns.DecRef(t)
311313
netns.DecRef(t)
314+
if childPIDNS != nil {
315+
childPIDNS.DecRef(t)
316+
}
312317

313318
// If this is the last task to exit from the thread group, release the
314319
// thread group's resources.

pkg/sentry/kernel/thread_group.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ type ThreadGroup struct {
285285
// The new thread group isn't visible to the system until a task has been
286286
// created inside of it by a successful call to TaskSet.NewTask.
287287
func (k *Kernel) NewThreadGroup(pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
288+
pidns.IncRef()
288289
tg := &ThreadGroup{
289290
threadGroupNode: threadGroupNode{
290291
pidns: pidns,
@@ -364,6 +365,7 @@ func (tg *ThreadGroup) Release(ctx context.Context) {
364365
for _, it := range its {
365366
it.DestroyTimer()
366367
}
368+
tg.pidns.DecRef(ctx)
367369
}
368370

369371
// forEachChildThreadGroupLocked iterates over all child ThreadGroups.

pkg/sentry/kernel/threads.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ import (
1818
"fmt"
1919

2020
"gvisor.dev/gvisor/pkg/atomicbitops"
21+
"gvisor.dev/gvisor/pkg/context"
22+
"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
2123
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
2224
"gvisor.dev/gvisor/pkg/sync"
2325
"gvisor.dev/gvisor/pkg/waiter"
@@ -207,6 +209,8 @@ type PIDNamespace struct {
207209

208210
// pidNamespaceData contains additional per-PID-namespace data.
209211
extra pidNamespaceData
212+
213+
inode *nsfs.Inode
210214
}
211215

212216
func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
@@ -226,6 +230,11 @@ func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespa
226230
}
227231
}
228232

233+
// InitInode creates and sets a new nsfs.Inode.
234+
func (ns *PIDNamespace) InitInode(ctx context.Context, k *Kernel) {
235+
ns.inode = nsfs.NewInode(ctx, k.nsfsMount, ns)
236+
}
237+
229238
// lastPIDNSID is the last value of PIDNamespace.ID assigned to a PID
230239
// namespace.
231240
//
@@ -239,10 +248,35 @@ func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace {
239248
return newPIDNamespace(nil, nil, userns)
240249
}
241250

251+
// GetInode returns the nsfs inode associated with the namespace.
252+
func (ns *PIDNamespace) GetInode() *nsfs.Inode {
253+
return ns.inode
254+
}
255+
256+
// IncRef increments the namespace's refcount.
257+
func (ns *PIDNamespace) IncRef() {
258+
ns.inode.IncRef()
259+
}
260+
261+
// DecRef decrements the namespace's refcount.
262+
func (ns *PIDNamespace) DecRef(ctx context.Context) {
263+
ns.inode.DecRef(ctx)
264+
}
265+
266+
// Destroy implements nsfs.Namespace.Destroy.
267+
func (ns *PIDNamespace) Destroy(ctx context.Context) {}
268+
269+
// Type implements nsfs.Namespace.Type.
270+
func (ns *PIDNamespace) Type() string {
271+
return "pid"
272+
}
273+
242274
// NewChild returns a new, empty PID namespace that is a child of ns. Authority
243275
// over the new PID namespace is controlled by userns.
244-
func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
245-
return newPIDNamespace(ns.owner, ns, userns)
276+
func (ns *PIDNamespace) NewChild(ctx context.Context, k *Kernel, userns *auth.UserNamespace) *PIDNamespace {
277+
pidns := newPIDNamespace(ns.owner, ns, userns)
278+
pidns.InitInode(ctx, k)
279+
return pidns
246280
}
247281

248282
// TaskWithID returns the task with thread ID tid in PID namespace ns. If no
@@ -538,6 +572,12 @@ func (t *Task) PIDNamespace() *PIDNamespace {
538572
return t.tg.pidns
539573
}
540574

575+
// GetPIDNamespace returns the PID namespace containing t after taking a reference on it.
576+
func (t *Task) GetPIDNamespace() *PIDNamespace {
577+
t.tg.pidns.IncRef()
578+
return t.tg.pidns
579+
}
580+
541581
// TaskSet returns the TaskSet containing t.
542582
func (t *Task) TaskSet() *TaskSet {
543583
return t.tg.pidns.owner

runsc/boot/loader.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,7 @@ func New(args Args) (*Loader, error) {
625625
VdsoParams: params,
626626
RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
627627
RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
628-
PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
628+
RootPIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
629629
MaxFDLimit: maxFDLimit,
630630
UnixSocketOpts: unixSocketOpts,
631631
}); err != nil {
@@ -1090,7 +1090,7 @@ func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid st
10901090
}
10911091
if pidns == nil {
10921092
log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
1093-
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
1093+
pidns = l.k.RootPIDNamespace().NewChild(l.k.SupervisorContext(), l.k, l.k.RootUserNamespace())
10941094
}
10951095
ep.pidnsPath = ns.Path
10961096
} else {

0 commit comments

Comments
 (0)