Skip to content

Commit 9b44934

Browse files
committed
Fix snapshot automount race causing duplicate mounts and AVL tree panic
Multiple threads racing to automount the same snapshot can both spawn mount helper processes that successfully complete, causing both parent threads to attempt AVL tree registration and triggering a VERIFY() panic in avl_add(). This occurs because the fsconfig/fsmount API lacks the serialization provided by traditional mount() via lock_mount(). The fix adds a per-entry mutex (se_mtx) to zfs_snapentry_t that serializes mount and unmount operations on the same snapshot. The first mount thread creates a pending entry with se_spa=NULL and holds se_mtx during the helper execution. Concurrent mounts find the pending entry and return success without spawning duplicate helpers. Unmount waits on se_mtx if a mount is pending, ensuring proper serialization. This allows different snapshots to mount in parallel while preventing the AVL panic. Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
1 parent 88d012a commit 9b44934

File tree

1 file changed

+104
-38
lines changed

1 file changed

+104
-38
lines changed

module/os/linux/zfs/zfs_ctldir.c

Lines changed: 104 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,17 @@ static int zfs_snapshot_no_setuid = 0;
117117
typedef struct {
118118
char *se_name; /* full snapshot name */
119119
char *se_path; /* full mount path */
120-
spa_t *se_spa; /* pool spa */
120+
spa_t *se_spa; /* pool spa (NULL if pending) */
121121
uint64_t se_objsetid; /* snapshot objset id */
122122
struct dentry *se_root_dentry; /* snapshot root dentry */
123123
taskqid_t se_taskqid; /* scheduled unmount taskqid */
124124
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
125125
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
126126
zfs_refcount_t se_refcount; /* reference count */
127+
kmutex_t se_mtx; /* protects se_mounting and se_cv */
128+
kcondvar_t se_cv; /* signal mount completion */
129+
boolean_t se_mounting; /* mount operation in progress */
130+
int se_mount_error; /* error from failed mount */
127131
} zfs_snapentry_t;
128132

129133
static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
@@ -146,6 +150,10 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
146150
se->se_objsetid = objsetid;
147151
se->se_root_dentry = root_dentry;
148152
se->se_taskqid = TASKQID_INVALID;
153+
mutex_init(&se->se_mtx, NULL, MUTEX_DEFAULT, NULL);
154+
cv_init(&se->se_cv, NULL, CV_DEFAULT, NULL);
155+
se->se_mounting = B_FALSE;
156+
se->se_mount_error = 0;
149157

150158
zfs_refcount_create(&se->se_refcount);
151159

@@ -162,6 +170,8 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
162170
zfs_refcount_destroy(&se->se_refcount);
163171
kmem_strfree(se->se_name);
164172
kmem_strfree(se->se_path);
173+
mutex_destroy(&se->se_mtx);
174+
cv_destroy(&se->se_cv);
165175

166176
kmem_free(se, sizeof (zfs_snapentry_t));
167177
}
@@ -187,34 +197,52 @@ zfsctl_snapshot_rele(zfs_snapentry_t *se)
187197
}
188198

189199
/*
190-
* Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
191-
* zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
192-
* of the trees a reference is held.
200+
* Add a zfs_snapentry_t to the zfs_snapshots_by_name tree. If the entry
201+
* is not pending (se_spa != NULL), also add to zfs_snapshots_by_objsetid.
202+
* While the zfs_snapentry_t is part of the trees a reference is held.
193203
*/
194204
static void
195205
zfsctl_snapshot_add(zfs_snapentry_t *se)
196206
{
197207
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
198208
zfsctl_snapshot_hold(se);
199209
avl_add(&zfs_snapshots_by_name, se);
200-
avl_add(&zfs_snapshots_by_objsetid, se);
210+
if (se->se_spa != NULL)
211+
avl_add(&zfs_snapshots_by_objsetid, se);
201212
}
202213

203214
/*
204-
* Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
205-
* zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
206-
* this can result in the structure being freed if that was the last
207-
* remaining reference.
215+
* Remove a zfs_snapentry_t from the zfs_snapshots_by_name tree and
216+
* zfs_snapshots_by_objsetid tree (if not pending). Upon removal a
217+
* reference is dropped, this can result in the structure being freed
218+
* if that was the last remaining reference.
208219
*/
209220
static void
210221
zfsctl_snapshot_remove(zfs_snapentry_t *se)
211222
{
212223
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
213224
avl_remove(&zfs_snapshots_by_name, se);
214-
avl_remove(&zfs_snapshots_by_objsetid, se);
225+
if (se->se_spa != NULL)
226+
avl_remove(&zfs_snapshots_by_objsetid, se);
215227
zfsctl_snapshot_rele(se);
216228
}
217229

230+
/*
231+
* Fill a pending zfs_snapentry_t after mount succeeds. Fills in the
232+
* remaining fields and adds the entry to the zfs_snapshots_by_objsetid tree.
233+
*/
234+
static void
235+
zfsctl_snapshot_fill(zfs_snapentry_t *se, spa_t *spa, uint64_t objsetid,
236+
struct dentry *root_dentry)
237+
{
238+
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
239+
ASSERT3P(se->se_spa, ==, NULL);
240+
se->se_spa = spa;
241+
se->se_objsetid = objsetid;
242+
se->se_root_dentry = root_dentry;
243+
avl_add(&zfs_snapshots_by_objsetid, se);
244+
}
245+
218246
/*
219247
* Snapshot name comparison function for the zfs_snapshots_by_name.
220248
*/
@@ -312,6 +340,11 @@ zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
312340
se = zfsctl_snapshot_find_by_name(old_snapname);
313341
if (se == NULL)
314342
return (SET_ERROR(ENOENT));
343+
if (se->se_spa == NULL) {
344+
/* Snapshot mount is in progress */
345+
zfsctl_snapshot_rele(se);
346+
return (SET_ERROR(EBUSY));
347+
}
315348

316349
zfsctl_snapshot_remove(se);
317350
kmem_strfree(se->se_name);
@@ -430,26 +463,6 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
430463
return (error);
431464
}
432465

433-
/*
434-
* Check if snapname is currently mounted. Returned non-zero when mounted
435-
* and zero when unmounted.
436-
*/
437-
static boolean_t
438-
zfsctl_snapshot_ismounted(const char *snapname)
439-
{
440-
zfs_snapentry_t *se;
441-
boolean_t ismounted = B_FALSE;
442-
443-
rw_enter(&zfs_snapshot_lock, RW_READER);
444-
if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
445-
zfsctl_snapshot_rele(se);
446-
ismounted = B_TRUE;
447-
}
448-
rw_exit(&zfs_snapshot_lock);
449-
450-
return (ismounted);
451-
}
452-
453466
/*
454467
* Check if the given inode is a part of the virtual .zfs directory.
455468
*/
@@ -1131,6 +1144,14 @@ zfsctl_snapshot_unmount(const char *snapname, int flags)
11311144
}
11321145
rw_exit(&zfs_snapshot_lock);
11331146

1147+
/*
1148+
* Wait for any pending auto-mount to complete before unmounting.
1149+
*/
1150+
mutex_enter(&se->se_mtx);
1151+
while (se->se_mounting)
1152+
cv_wait(&se->se_cv, &se->se_mtx);
1153+
mutex_exit(&se->se_mtx);
1154+
11341155
exportfs_flush();
11351156

11361157
if (flags & MNT_FORCE)
@@ -1232,14 +1253,35 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12321253
zfs_snapshot_no_setuid ? "nosuid" : "suid");
12331254

12341255
/*
1235-
* Multiple concurrent automounts of a snapshot are never allowed.
1236-
* The snapshot may be manually mounted as many times as desired.
1256+
* Check if snapshot is already being mounted. If found, wait for
1257+
* pending mount to complete before returning success.
12371258
*/
1238-
if (zfsctl_snapshot_ismounted(full_name)) {
1239-
error = 0;
1259+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1260+
if ((se = zfsctl_snapshot_find_by_name(full_name)) != NULL) {
1261+
rw_exit(&zfs_snapshot_lock);
1262+
mutex_enter(&se->se_mtx);
1263+
while (se->se_mounting)
1264+
cv_wait(&se->se_cv, &se->se_mtx);
1265+
1266+
/*
1267+
* Return the same error as the first mount attempt (0 if
1268+
* succeeded, error code if failed).
1269+
*/
1270+
error = se->se_mount_error;
1271+
mutex_exit(&se->se_mtx);
1272+
zfsctl_snapshot_rele(se);
12401273
goto error;
12411274
}
12421275

1276+
/*
1277+
* Create pending entry and mark mount in progress.
1278+
*/
1279+
se = zfsctl_snapshot_alloc(full_name, full_path, NULL, 0, NULL);
1280+
se->se_mounting = B_TRUE;
1281+
zfsctl_snapshot_add(se);
1282+
zfsctl_snapshot_hold(se);
1283+
rw_exit(&zfs_snapshot_lock);
1284+
12431285
/*
12441286
* Attempt to mount the snapshot from user space. Normally this
12451287
* would be done using the vfs_kern_mount() function, however that
@@ -1258,6 +1300,9 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12581300
argv[9] = full_path;
12591301
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
12601302
if (error) {
1303+
/*
1304+
* Mount failed - cleanup pending entry and signal waiters.
1305+
*/
12611306
if (!(error & MOUNT_BUSY << 8)) {
12621307
zfs_dbgmsg("Unable to automount %s error=%d",
12631308
full_path, error);
@@ -1273,6 +1318,16 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12731318
*/
12741319
error = 0;
12751320
}
1321+
1322+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1323+
zfsctl_snapshot_remove(se);
1324+
rw_exit(&zfs_snapshot_lock);
1325+
mutex_enter(&se->se_mtx);
1326+
se->se_mount_error = error;
1327+
se->se_mounting = B_FALSE;
1328+
cv_broadcast(&se->se_cv);
1329+
mutex_exit(&se->se_mtx);
1330+
zfsctl_snapshot_rele(se);
12761331
goto error;
12771332
}
12781333

@@ -1289,14 +1344,25 @@ zfsctl_snapshot_mount(struct path *path, int flags)
12891344
spath.mnt->mnt_flags |= MNT_SHRINKABLE;
12901345

12911346
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1292-
se = zfsctl_snapshot_alloc(full_name, full_path,
1293-
snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
1294-
dentry);
1295-
zfsctl_snapshot_add(se);
1347+
zfsctl_snapshot_fill(se, snap_zfsvfs->z_os->os_spa,
1348+
dmu_objset_id(snap_zfsvfs->z_os), dentry);
12961349
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
12971350
rw_exit(&zfs_snapshot_lock);
1351+
} else {
1352+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
1353+
zfsctl_snapshot_remove(se);
1354+
rw_exit(&zfs_snapshot_lock);
12981355
}
12991356
path_put(&spath);
1357+
1358+
/*
1359+
* Signal mount completion and cleanup.
1360+
*/
1361+
mutex_enter(&se->se_mtx);
1362+
se->se_mounting = B_FALSE;
1363+
cv_broadcast(&se->se_cv);
1364+
mutex_exit(&se->se_mtx);
1365+
zfsctl_snapshot_rele(se);
13001366
error:
13011367
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
13021368
kmem_free(full_path, MAXPATHLEN);

0 commit comments

Comments
 (0)