Skip to content

Commit e1d2468

Browse files
tranji-cloudavagin
authored andcommitted
Prevent inode sharing for directory dentries by by-passing cache.
Introduce `dontCache` argument in `getOrCreateInode`. In both directfs and lisafs, set the new arg to true if the inode represents a directory in new*fsDentry. This avoids issues with hard-linking directories and correctly handles scenarios involving bind mounts on the host, where different paths can map to the same device and inode number for a directory. Co-authored-by: Andrei Vagin <avagin@google.com> PiperOrigin-RevId: 813023661
1 parent 3a84651 commit e1d2468

File tree

4 files changed

+122
-95
lines changed

4 files changed

+122
-95
lines changed

pkg/sentry/fsimpl/gofer/directfs_inode.go

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) {
123123
_ = unix.Close(controlFD)
124124
return nil, err
125125
}
126+
isDir := stat.Mode&linux.FileTypeMask == linux.ModeDirectory
126127
inoKey := inoKeyFromStat(&stat)
127128

128129
// Common case. Performance hack which is used to allocate the dentry
@@ -134,30 +135,36 @@ func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) {
134135
d dentry
135136
i directfsInode
136137
}{}
137-
temp.d.inode = fs.getOrCreateInode(inoKey, func() { _ = unix.Close(controlFD) }, func() *inode {
138-
temp.i = directfsInode{
139-
inode: inode{
140-
fs: fs,
141-
inoKey: inoKey,
142-
ino: fs.inoFromKey(inoKey),
143-
mode: atomicbitops.FromUint32(stat.Mode),
144-
uid: atomicbitops.FromUint32(stat.Uid),
145-
gid: atomicbitops.FromUint32(stat.Gid),
146-
blockSize: atomicbitops.FromUint32(uint32(stat.Blksize)),
147-
readFD: atomicbitops.FromInt32(-1),
148-
writeFD: atomicbitops.FromInt32(-1),
149-
mmapFD: atomicbitops.FromInt32(-1),
150-
size: atomicbitops.FromUint64(uint64(stat.Size)),
151-
atime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)),
152-
mtime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)),
153-
ctime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)),
154-
nlink: atomicbitops.FromUint32(uint32(stat.Nlink)),
155-
},
156-
controlFD: controlFD,
157-
}
158-
temp.i.inode.init(&temp.i)
159-
return &temp.i.inode
160-
})
138+
// Force new inode creation for directory inodes to avoid hard-linking directories.
139+
// This also avoids a correctness issue when a directory is bind-mounted on the host:
140+
// different paths (e.g., /mnt/ and /mnt/a/b/c if /mnt/a/b/c is a bind mount of /mnt/)
141+
// can return the same device ID and inode number from a stat call.
142+
temp.d.inode = fs.getOrCreateInode(inoKey /* dontCache = */, isDir,
143+
func() { _ = unix.Close(controlFD) },
144+
func() *inode {
145+
temp.i = directfsInode{
146+
inode: inode{
147+
fs: fs,
148+
inoKey: inoKey,
149+
ino: fs.inoFromKey(inoKey),
150+
mode: atomicbitops.FromUint32(stat.Mode),
151+
uid: atomicbitops.FromUint32(stat.Uid),
152+
gid: atomicbitops.FromUint32(stat.Gid),
153+
blockSize: atomicbitops.FromUint32(uint32(stat.Blksize)),
154+
readFD: atomicbitops.FromInt32(-1),
155+
writeFD: atomicbitops.FromInt32(-1),
156+
mmapFD: atomicbitops.FromInt32(-1),
157+
size: atomicbitops.FromUint64(uint64(stat.Size)),
158+
atime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)),
159+
mtime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)),
160+
ctime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)),
161+
nlink: atomicbitops.FromUint32(uint32(stat.Nlink)),
162+
},
163+
controlFD: controlFD,
164+
}
165+
temp.i.inode.init(&temp.i)
166+
return &temp.i.inode
167+
})
161168

162169
temp.d.init()
163170
fs.syncMu.Lock()

pkg/sentry/fsimpl/gofer/filesystem.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,10 @@ func (fs *filesystem) getRemoteChildLocked(ctx context.Context, parent *dentry,
283283
fs.inodeMu.Lock()
284284
child.inode.refs.DecRef(func() {
285285
destroyInode = true
286-
delete(fs.inodeByKey, child.inode.inoKey)
286+
if !child.isDir() {
287+
// Only non-directory inodes are cached in inodeByKey.
288+
delete(fs.inodeByKey, child.inode.inoKey)
289+
}
287290
})
288291
fs.inodeMu.Unlock()
289292
child.destroyDisconnected(ctx, destroyInode)

pkg/sentry/fsimpl/gofer/gofer.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,11 @@ type filesystem struct {
239239
// files across checkpoint/restore. inoByKey is protected by inoMu.
240240
inoMu sync.Mutex `state:"nosave"`
241241
inoByKey map[inoKey]uint64 `state:"nosave"`
242-
// inodeByKey maps internal inodeKeys [virtual ino and device ID] to inodes. inodeByKey is
243-
// protected by inodeMu. inodeByKey is similar to inoByKey, except that is not an
244-
// ever-growing map. As inodes are destroyed, its entry in this map is cleared.
242+
// inodeByKey maps inoKey to non-directory inodes. inodeByKey is protected by
243+
// inodeMu. inodeByKey is similar to inoByKey, except that is not an
244+
// ever-growing map. This is to allow for the sharing of inodes across
245+
// dentries. As non-directory dentries are destroyed, their associated inode
246+
// entry in this map is cleared.
245247
inodeMu sync.Mutex `state:"nosave"`
246248
inodeByKey map[inoKey]*inode `state:"nosave"`
247249

@@ -260,12 +262,20 @@ type filesystem struct {
260262
released atomicbitops.Int32
261263
}
262264

263-
// getOrCreateInode returns the inode for the given inoKey and upon inode cache
264-
// hit, calls onCacheHit and increments the inode's reference count. On cache
265-
// miss, createInode is called and the new inode is returned.
266-
// getOrCreateInode locks fs.inodeMu.
265+
// getOrCreateInode returns an inode for the given inoKey, to avoid creating
266+
// multiple inodes objects for the same inoKey. This is useful for sharing
267+
// inodes across dentries.
268+
//
269+
// If a miss, `createInode` is called and the result is cached. If a hit,
270+
// `onCacheHit` is called. This path locks fs.inodeMu.
271+
//
267272
// The caller is responsible for decrementing the inode's reference count when done.
268-
func (fs *filesystem) getOrCreateInode(inoKey inoKey, onCacheHit func(), createInode func() *inode) *inode {
273+
func (fs *filesystem) getOrCreateInode(inoKey inoKey, dontCache bool, onCacheHit func(), createInode func() *inode) *inode {
274+
if dontCache {
275+
// Create a new inode and return it bypassing the cache. This is used for
276+
// creating inodes that are not meant to be shared across dentries.
277+
return createInode()
278+
}
269279
fs.inodeMu.Lock()
270280
defer fs.inodeMu.Unlock()
271281
if cachedInode, ok := fs.inodeByKey[inoKey]; ok {
@@ -1886,7 +1896,10 @@ func (d *dentry) destroyLocked(ctx context.Context) {
18861896
d.inode.fs.inodeMu.Lock()
18871897
d.inode.refs.DecRef(func() {
18881898
destroyInode = true
1889-
delete(d.inode.fs.inodeByKey, d.inode.inoKey)
1899+
if !d.isDir() {
1900+
// Only non-directory inodes are cached in inodeByKey.
1901+
delete(d.inode.fs.inodeByKey, d.inode.inoKey)
1902+
}
18901903
})
18911904
d.inode.fs.inodeMu.Unlock()
18921905

pkg/sentry/fsimpl/gofer/lisafs_inode.go

Lines changed: 65 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (*
114114
fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */)
115115
return nil, linuxerr.EIO
116116
}
117-
117+
isDir := ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory
118118
inoKey := inoKeyFromStatx(&ino.Stat)
119119
// Common case. Performance hack which is used to allocate the dentry
120120
// and its inode together in the heap. This will help reduce allocations and memory
@@ -125,69 +125,73 @@ func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (*
125125
d dentry
126126
i lisafsInode
127127
}{}
128-
temp.d.inode = fs.getOrCreateInode(inoKey, func() {
129-
fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */)
130-
}, func() *inode {
131-
temp.i = lisafsInode{
132-
inode: inode{
133-
fs: fs,
134-
inoKey: inoKey,
135-
ino: fs.inoFromKey(inoKey),
136-
mode: atomicbitops.FromUint32(uint32(ino.Stat.Mode)),
137-
uid: atomicbitops.FromUint32(uint32(fs.opts.dfltuid)),
138-
gid: atomicbitops.FromUint32(uint32(fs.opts.dfltgid)),
139-
blockSize: atomicbitops.FromUint32(hostarch.PageSize),
140-
readFD: atomicbitops.FromInt32(-1),
141-
writeFD: atomicbitops.FromInt32(-1),
142-
mmapFD: atomicbitops.FromInt32(-1),
143-
},
144-
controlFD: fs.client.NewFD(ino.ControlFD),
145-
}
146-
temp.i.inode.init(&temp.i)
147-
inode := &temp.i.inode
148-
if ino.Stat.Mask&linux.STATX_UID != 0 {
149-
inode.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID)))
150-
}
151-
if ino.Stat.Mask&linux.STATX_GID != 0 {
152-
inode.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID)))
153-
}
154-
if ino.Stat.Mask&linux.STATX_SIZE != 0 {
155-
inode.size = atomicbitops.FromUint64(ino.Stat.Size)
156-
}
157-
if ino.Stat.Blksize != 0 {
158-
inode.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize)
159-
}
160-
if ino.Stat.Mask&linux.STATX_ATIME != 0 {
161-
inode.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime))
162-
} else {
163-
inode.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds())
164-
}
165-
if ino.Stat.Mask&linux.STATX_MTIME != 0 {
166-
inode.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime))
167-
} else {
168-
inode.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds())
169-
}
170-
if ino.Stat.Mask&linux.STATX_CTIME != 0 {
171-
inode.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime))
172-
} else {
173-
// Approximate ctime with mtime if ctime isn't available.
174-
inode.ctime = atomicbitops.FromInt64(inode.mtime.Load())
175-
}
176-
if ino.Stat.Mask&linux.STATX_BTIME != 0 {
177-
inode.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime))
178-
}
128+
// Force new inode creation for directory inodes to avoid hard-linking directories.
129+
// This also avoids a correctness issue when a directory is bind-mounted on the host:
130+
// different paths (e.g., /mnt/ and /mnt/a/b/c if /mnt/a/b/c is a bind mount of /mnt/)
131+
// can return the same device ID and inode number from a stat call.
132+
temp.d.inode = fs.getOrCreateInode(inoKey /* dontCache = */, isDir,
133+
func() { fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */) },
134+
func() *inode {
135+
temp.i = lisafsInode{
136+
inode: inode{
137+
fs: fs,
138+
inoKey: inoKey,
139+
ino: fs.inoFromKey(inoKey),
140+
mode: atomicbitops.FromUint32(uint32(ino.Stat.Mode)),
141+
uid: atomicbitops.FromUint32(uint32(fs.opts.dfltuid)),
142+
gid: atomicbitops.FromUint32(uint32(fs.opts.dfltgid)),
143+
blockSize: atomicbitops.FromUint32(hostarch.PageSize),
144+
readFD: atomicbitops.FromInt32(-1),
145+
writeFD: atomicbitops.FromInt32(-1),
146+
mmapFD: atomicbitops.FromInt32(-1),
147+
},
148+
controlFD: fs.client.NewFD(ino.ControlFD),
149+
}
150+
temp.i.inode.init(&temp.i)
151+
inode := &temp.i.inode
152+
if ino.Stat.Mask&linux.STATX_UID != 0 {
153+
inode.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID)))
154+
}
155+
if ino.Stat.Mask&linux.STATX_GID != 0 {
156+
inode.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID)))
157+
}
158+
if ino.Stat.Mask&linux.STATX_SIZE != 0 {
159+
inode.size = atomicbitops.FromUint64(ino.Stat.Size)
160+
}
161+
if ino.Stat.Blksize != 0 {
162+
inode.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize)
163+
}
164+
if ino.Stat.Mask&linux.STATX_ATIME != 0 {
165+
inode.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime))
166+
} else {
167+
inode.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds())
168+
}
169+
if ino.Stat.Mask&linux.STATX_MTIME != 0 {
170+
inode.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime))
171+
} else {
172+
inode.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds())
173+
}
174+
if ino.Stat.Mask&linux.STATX_CTIME != 0 {
175+
inode.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime))
176+
} else {
177+
// Approximate ctime with mtime if ctime isn't available.
178+
inode.ctime = atomicbitops.FromInt64(inode.mtime.Load())
179+
}
180+
if ino.Stat.Mask&linux.STATX_BTIME != 0 {
181+
inode.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime))
182+
}
179183

180-
if ino.Stat.Mask&linux.STATX_NLINK != 0 {
181-
inode.nlink = atomicbitops.FromUint32(ino.Stat.Nlink)
182-
} else {
183-
if ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory {
184-
inode.nlink = atomicbitops.FromUint32(2)
184+
if ino.Stat.Mask&linux.STATX_NLINK != 0 {
185+
inode.nlink = atomicbitops.FromUint32(ino.Stat.Nlink)
185186
} else {
186-
inode.nlink = atomicbitops.FromUint32(1)
187+
if isDir {
188+
inode.nlink = atomicbitops.FromUint32(2)
189+
} else {
190+
inode.nlink = atomicbitops.FromUint32(1)
191+
}
187192
}
188-
}
189-
return inode
190-
})
193+
return inode
194+
})
191195

192196
temp.d.init()
193197
fs.syncMu.Lock()

0 commit comments

Comments
 (0)