Skip to content

Commit 1941bc6

Browse files
shailend-ggvisor-bot
authored andcommitted
Fix race between unshare(2) and execve(2).
There are two issues being fixed: 1) checkAndPreventSharingOutsideTG() was accessing t.fsContext without grabbing t.mu and thus causing a data race with a concurrent unshare and 2) the concurrent unshare might present an incomplete view of the shared fscontext and cause execve to incorrectly conclude that it was shared outside the thread group. Reported-by: syzbot+e95172bececeffb258a9@syzkaller.appspotmail.com Reported-by: syzbot+f834131c3fbdeec4b560@syzkaller.appspotmail.com PiperOrigin-RevId: 800327461
1 parent fd37091 commit 1941bc6

File tree

10 files changed

+86
-35
lines changed

10 files changed

+86
-35
lines changed

pkg/sentry/kernel/BUILD

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@ go_template_instance(
4040
},
4141
)
4242

43+
go_template_instance(
44+
name = "atomicptr_fscontext",
45+
out = "atomicptr_fscontext_unsafe.go",
46+
package = "kernel",
47+
prefix = "fsContext",
48+
template = "//pkg/sync/atomicptr:generic_atomicptr",
49+
types = {
50+
"Value": "fsContext",
51+
},
52+
)
53+
4354
declare_mutex(
4455
name = "user_counters_mutex",
4556
out = "user_counters_mutex.go",

pkg/sentry/kernel/fs_context.go

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,20 @@ func NewFSContext(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
6161
return &f
6262
}
6363

64+
// destroy destroys the FSContext.
65+
//
66+
// Preconditions: f must have no refcount.
67+
func (f *FSContext) destroy(ctx context.Context) {
68+
// Hold f.mu so that we don't race with RootDirectory() and
69+
// WorkingDirectory().
70+
f.mu.Lock()
71+
defer f.mu.Unlock()
72+
f.root.DecRef(ctx)
73+
f.root = vfs.VirtualDentry{}
74+
f.cwd.DecRef(ctx)
75+
f.cwd = vfs.VirtualDentry{}
76+
}
77+
6478
// DecRef implements RefCounter.DecRef.
6579
//
6680
// When f reaches zero references, DecRef will be called on both root and cwd
@@ -71,15 +85,7 @@ func NewFSContext(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
7185
// proc files or other mechanisms.
7286
func (f *FSContext) DecRef(ctx context.Context) {
7387
f.FSContextRefs.DecRef(func() {
74-
// Hold f.mu so that we don't race with RootDirectory() and
75-
// WorkingDirectory().
76-
f.mu.Lock()
77-
defer f.mu.Unlock()
78-
79-
f.root.DecRef(ctx)
80-
f.root = vfs.VirtualDentry{}
81-
f.cwd.DecRef(ctx)
82-
f.cwd = vfs.VirtualDentry{}
88+
f.destroy(ctx)
8389
})
8490
}
8591

@@ -202,7 +208,7 @@ func (f *FSContext) checkAndPreventSharingOutsideTG(tg *ThreadGroup) bool {
202208

203209
tgCount := int64(0)
204210
tg.ForEachTask(func(t *Task) bool {
205-
if t.fsContext == f {
211+
if t.FSContext() == f {
206212
tgCount++
207213
}
208214
return true
@@ -233,3 +239,20 @@ func (f *FSContext) share() bool {
233239
f.IncRef()
234240
return true
235241
}
242+
243+
// unshareFromTask removes the FSContext f from the given Task t and replaces it with newF.
244+
// It returns a bool indicating whether f needs to be destroyed.
245+
246+
// This func operates without compromising a concurrent checkAndPreventSharingOutsideTG(): t's
247+
// association with f is severed atomically by holding f.mu, allowing the concurrent func to
248+
// correctly ascribe extra ref counts to tasks outside of t's thread group.
249+
//
250+
// Preconditions: The caller must be on the task goroutine and must hold t.mu.
251+
func (f *FSContext) unshareFromTask(t *Task, newF *FSContext) bool {
252+
f.mu.Lock()
253+
defer f.mu.Unlock()
254+
t.fsContext.Store(newF)
255+
destroy := false
256+
f.FSContextRefs.DecRef(func() { destroy = true })
257+
return destroy
258+
}

pkg/sentry/kernel/kernel.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,9 +1993,9 @@ func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualD
19931993
k.tasks.mu.RLock()
19941994
oldRootDecRefs := 0
19951995
k.tasks.forEachTaskLocked(func(t *Task) {
1996-
t.mu.Lock()
1996+
t.mu.Lock() // To prevent t's task goroutine from unsharing its fsContext from under us.
19971997
defer t.mu.Unlock()
1998-
if fsc := t.fsContext; fsc != nil {
1998+
if fsc := t.FSContext(); fsc != nil {
19991999
fsc.mu.Lock()
20002000
defer fsc.mu.Unlock()
20012001
if fsc.root == oldRoot {

pkg/sentry/kernel/kernel_state.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,16 @@ func (t *Task) loadSeccomp(_ context.Context, seccompData *taskSeccomp) {
6767
t.seccomp.Store(seccompData)
6868
}
6969

70+
// saveFsContext is invoked by stateify.
71+
func (t *Task) saveFsContext() *FSContext {
72+
return t.fsContext.Load()
73+
}
74+
75+
// loadFsContext is invoked by stateify.
76+
func (t *Task) loadFsContext(_ context.Context, fsContext *FSContext) {
77+
t.fsContext.Store(fsContext)
78+
}
79+
7080
// saveAppCPUClockLast is invoked by stateify.
7181
func (tg *ThreadGroup) saveAppCPUClockLast() *Task {
7282
return tg.appCPUClockLast.Load()

pkg/sentry/kernel/task.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,9 @@ type Task struct {
291291

292292
// fsContext is the task's filesystem context.
293293
//
294-
// fsContext is protected by mu, and is owned by the task goroutine.
295-
fsContext *FSContext
294+
// fsContext is protected by mu, and is owned by the task goroutine. It can be read without
295+
// synchronization using FSContext().
296+
fsContext atomic.Pointer[FSContext] `state:".(*FSContext)"`
296297

297298
// fdTable is the task's file descriptor table.
298299
//
@@ -750,7 +751,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
750751
func (t *Task) IsChrooted() bool {
751752
realRoot := t.mountNamespace.Root(t)
752753
defer realRoot.DecRef(t)
753-
root := t.fsContext.RootDirectory()
754+
root := t.FSContext().RootDirectory()
754755
defer root.DecRef(t)
755756
return root != realRoot
756757
}
@@ -766,10 +767,11 @@ func (t *Task) TaskImage() *TaskImage {
766767
// FSContext returns t's FSContext. FSContext does not take an additional
767768
// reference on the returned FSContext.
768769
//
769-
// Precondition: The caller must be running on the task goroutine, or t.mu must
770-
// be locked.
770+
// Precondition: No synchronization is needed for just read access. If the caller instead intends to
771+
// modify the contents of the FSContext, it must either be running on the task goroutine or have
772+
// t.mu locked.
771773
func (t *Task) FSContext() *FSContext {
772-
return t.fsContext
774+
return t.fsContext.Load()
773775
}
774776

775777
// FDTable returns t's FDTable. FDMTable does not take an additional reference

pkg/sentry/kernel/task_clone.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
192192

193193
var fsContext *FSContext
194194
if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
195-
fsContext = t.fsContext.Fork()
195+
fsContext = t.FSContext().Fork()
196196
} else {
197-
fsContext = t.fsContext
197+
fsContext = t.FSContext()
198198
if !fsContext.share() {
199199
// Linux fails clone with EAGAIN if there is a concurrent execve, see kernel/fork.c:copy_fs().
200200
return 0, nil, linuxerr.EAGAIN
@@ -518,7 +518,7 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
518518
!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
519519
return linuxerr.EPERM
520520
}
521-
oldFSContext := t.fsContext
521+
oldFSContext := t.FSContext()
522522
// The current task has to be an exclusive owner of its fs context.
523523
if oldFSContext.ReadRefs() != 1 {
524524
return linuxerr.EINVAL
@@ -535,7 +535,7 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
535535
ns.IncRef()
536536
t.mu.Lock()
537537
t.mountNamespace = ns
538-
t.fsContext = fsContext
538+
t.fsContext.Store(fsContext)
539539
t.mu.Unlock()
540540
oldNS.DecRef(t)
541541
oldFSContext.DecRef(t)
@@ -671,7 +671,6 @@ func (t *Task) Unshare(flags int32) error {
671671
defer cu.Clean()
672672
t.mu.Lock()
673673
defer t.mu.Unlock()
674-
// Can't defer unlock: DecRefs must occur without holding t.mu.
675674
if flags&linux.CLONE_NEWUTS != 0 {
676675
if !haveCapSysAdmin {
677676
return linuxerr.EPERM
@@ -701,16 +700,21 @@ func (t *Task) Unshare(flags int32) error {
701700
cu.Add(func() { oldFDTable.DecRef(t) })
702701
}
703702
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
704-
oldFSContext := t.fsContext
705-
t.fsContext = oldFSContext.Fork()
706-
cu.Add(func() { oldFSContext.DecRef(t) })
703+
oldFSContext := t.FSContext()
704+
// unshareFromTask() lowers the old fs context's ref count, but its for us to
705+
// destroy it if there are no other references.
706+
if oldFSContext.unshareFromTask(t, oldFSContext.Fork()) {
707+
// destroy() requires t.mu to not be held, hence the deferral.
708+
cu.Add(func() { oldFSContext.destroy(t) })
709+
}
707710
}
708711
if flags&linux.CLONE_NEWNS != 0 {
709712
if !haveCapSysAdmin {
710713
return linuxerr.EPERM
711714
}
712715
oldMountNS := t.mountNamespace
713-
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
716+
fsContext := t.FSContext()
717+
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &fsContext.root, &fsContext.cwd, t.k)
714718
if err != nil {
715719
return err
716720
}

pkg/sentry/kernel/task_context.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (t *Task) contextValue(key any, isTaskGoroutine bool) any {
9696
t.mu.Lock()
9797
defer t.mu.Unlock()
9898
}
99-
return t.fsContext.RootDirectory()
99+
return t.FSContext().RootDirectory()
100100
case vfs.CtxMountNamespace:
101101
if !isTaskGoroutine {
102102
t.mu.Lock()

pkg/sentry/kernel/task_exec.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,15 +144,16 @@ func (r *runExecveAfterExecveCredsLock) execute(t *Task) taskRunState {
144144
return (*runInterrupt)(nil)
145145
}
146146

147-
root := t.FSContext().RootDirectory()
147+
fsContext := t.FSContext()
148+
root := fsContext.RootDirectory()
148149
defer root.DecRef(t)
149-
wd := t.FSContext().WorkingDirectory()
150+
wd := fsContext.WorkingDirectory()
150151
defer wd.DecRef(t)
151152

152153
// Cannot gain privileges if the ptracer's capabilities prevent it.
153154
stopPrivGain := t.shouldStopPrivGainDueToPtracerLocked()
154155
// Cannot gain privileges if the FSContext is shared outside of our thread group.
155-
if t.fsContext.checkAndPreventSharingOutsideTG(t.tg) {
156+
if fsContext.checkAndPreventSharingOutsideTG(t.tg) {
156157
stopPrivGain = true
157158
}
158159

@@ -612,8 +613,8 @@ func (t *Task) shouldStopPrivGainDueToPtracerLocked() bool {
612613

613614
// Releases the mutex that serializes an execve(2) with a PTRACE_ATTACH and allows t.fsContext to
614615
// be shared again via clone(2). The caller must have called execveCredsMutexStartLock() and
615-
// fsContext.CheckAndPreventSharingOutsideTG() earlier.
616+
// fsContext.checkAndPreventSharingOutsideTG() earlier.
616617
func (t *Task) releaseExecveCredsLocks() {
617618
t.execveCredsMutexUnlock()
618-
t.fsContext.allowSharing()
619+
t.FSContext().allowSharing()
619620
}

pkg/sentry/kernel/task_exit.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
298298
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
299299
t.unstopVforkParent()
300300

301-
t.fsContext.DecRef(t)
301+
t.FSContext().DecRef(t)
302302
t.fdTable.DecRef(t)
303303

304304
// Detach task from all cgroups. This must happen before potentially the

pkg/sentry/kernel/task_start.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,6 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
159159
signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)),
160160
signalStack: linux.SignalStack{Flags: linux.SS_DISABLE},
161161
image: *image,
162-
fsContext: cfg.FSContext,
163162
fdTable: cfg.FDTable,
164163
k: cfg.Kernel,
165164
ptraceTracees: make(map[*Task]struct{}),
@@ -183,6 +182,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
183182
}
184183
t.netns = cfg.NetworkNamespace
185184
t.creds.Store(cfg.Credentials)
185+
t.fsContext.Store(cfg.FSContext)
186186
t.endStopCond.L = &t.tg.signalHandlers.mu
187187
// We don't construct t.blockingTimer until Task.run(); see that function
188188
// for justification.

0 commit comments

Comments
 (0)