From cf0f3e37e614723425c0fdfa8470b6c9b16729e5 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Tue, 21 Oct 2025 14:29:39 +0200 Subject: [PATCH 1/4] do_change_type(): refuse to operate on unmounted/not ours mounts jira VULN-98610 cve CVE-2025-38498 commit-author Al Viro commit 12f147ddd6de7382dad54812e65f3f08d05809fc Ensure that propagation settings can only be changed for mounts located in the caller's mount namespace. This change aligns permission checking with the rest of mount(2). Reviewed-by: Christian Brauner Fixes: 07b20889e305 ("beginning of the shared-subtree proper") Reported-by: "Orlando, Noah" Signed-off-by: Al Viro (cherry picked from commit 12f147ddd6de7382dad54812e65f3f08d05809fc) Signed-off-by: Roxana Nicolescu --- fs/namespace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 24f32c1eb6013..8c3600a91eff5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2329,6 +2329,10 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); + if (!check_mnt(mnt)) { + err = -EINVAL; + goto out_unlock; + } if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) From 71ac24bf28111fce77f3bfed7de221fb77426108 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 27 Oct 2025 15:27:31 +0100 Subject: [PATCH 2/4] move_mount: allow to add a mount into an existing group jira VULN-98610 cve-bf CVE-2025-38498 commit-author Pavel Tikhomirov commit 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2 Previously a sharing group (shared and master ids pair) can be only inherited when mount is created via bindmount. This patch adds an ability to add an existing private mount into an existing sharing group. With this functionality one can first create the desired mount tree from only private mounts (without the need to care about undesired mount propagation or mount creation order implied by sharing group dependencies), and next then setup any desired mount sharing between those mounts in tree as needed. This allows CRIU to restore any set of mount namespaces, mount trees and sharing group trees for a container. We have many issues with restoring mounts in CRIU related to sharing groups and propagation: - reverse sharing groups vs mount tree order requires complex mounts reordering which mostly implies also using some temporary mounts (please see https://lkml.org/lkml/2021/3/23/569 for more info) - mount() syscall creates tons of mounts due to propagation - mount re-parenting due to propagation - "Mount Trap" due to propagation - "Non Uniform" propagation, meaning that with different tricks with mount order and temporary children-"lock" mounts one can create mount trees which can't be restored without those tricks (see https://www.linuxplumbersconf.org/event/7/contributions/640/) With this new functionality we can resolve all the problems with propagation at once. Link: https://lore.kernel.org/r/20210715100714.120228-1-ptikhomirov@virtuozzo.com Cc: Eric W. Biederman Cc: Alexander Viro Cc: Christian Brauner Cc: Mattias Nissler Cc: Aleksa Sarai Cc: Andrei Vagin Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: lkml Co-developed-by: Andrei Vagin Acked-by: Christian Brauner Signed-off-by: Pavel Tikhomirov Signed-off-by: Andrei Vagin Signed-off-by: Christian Brauner (cherry picked from commit 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2) Signed-off-by: Roxana Nicolescu --- fs/namespace.c | 77 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 3 +- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8c3600a91eff5..971a1b91cac67 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2705,6 +2705,78 @@ static bool check_for_nsfs_mounts(struct mount *subtree) return ret; } +static int do_set_group(struct path *from_path, struct path *to_path) +{ + struct mount *from, *to; + int err; + + from = real_mount(from_path->mnt); + to = real_mount(to_path->mnt); + + namespace_lock(); + + err = -EINVAL; + /* To and From must be mounted */ + if (!is_mounted(&from->mnt)) + goto out; + if (!is_mounted(&to->mnt)) + goto out; + + err = -EPERM; + /* We should be allowed to modify mount namespaces of both mounts */ + if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + /* To and From paths should be mount roots */ + if (from_path->dentry != from_path->mnt->mnt_root) + goto out; + if (to_path->dentry != to_path->mnt->mnt_root) + goto out; + + /* Setting sharing groups is only allowed across same superblock */ + if (from->mnt.mnt_sb != to->mnt.mnt_sb) + goto out; + + /* From mount root should be wider than To mount root */ + if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) + goto out; + + /* From mount should not have locked children in place of To's root */ + if (has_locked_children(from, to->mnt.mnt_root)) + goto out; + + /* Setting sharing groups is only allowed on private mounts */ + if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) + goto out; + + /* From should not be private */ + if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) + goto out; + + if (IS_MNT_SLAVE(from)) { + struct mount *m = from->mnt_master; + + list_add(&to->mnt_slave, &m->mnt_slave_list); + to->mnt_master = m; + } + + if (IS_MNT_SHARED(from)) { + to->mnt_group_id = from->mnt_group_id; + list_add(&to->mnt_share, &from->mnt_share); + lock_mount_hash(); + set_mnt_shared(to); + unlock_mount_hash(); + } + + err = 0; +out: + namespace_unlock(); + return err; +} + static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; @@ -3689,7 +3761,10 @@ SYSCALL_DEFINE5(move_mount, if (ret < 0) goto out_to; - ret = do_move_mount(&from_path, &to_path); + if (flags & MOVE_MOUNT_SET_GROUP) + ret = do_set_group(&from_path, &to_path); + else + ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index dd7a166fdf9c2..4d93967f8aea0 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -73,7 +73,8 @@ #define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ #define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#define MOVE_MOUNT__MASK 0x00000077 +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#define MOVE_MOUNT__MASK 0x00000177 /* * fsopen() flags. From 64d6ebe22bd251efd5a375a884d91136d69e05ca Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 27 Oct 2025 15:29:20 +0100 Subject: [PATCH 3/4] fix propagation graph breakage by MOVE_MOUNT_SET_GROUP move_mount(2) jira VULN-98610 cve-bf CVE-2025-38498 commit-author Al Viro commit d8cc0362f918d020ca1340d7694f07062dc30f36 9ffb14ef61ba "move_mount: allow to add a mount into an existing group" breaks assertions on ->mnt_share/->mnt_slave. For once, the data structures in question are actually documented. Documentation/filesystem/sharedsubtree.rst: All vfsmounts in a peer group have the same ->mnt_master. If it is non-NULL, they form a contiguous (ordered) segment of slave list. do_set_group() puts a mount into the same place in propagation graph as the old one. As the result, if old mount gets events from somewhere and is not a pure event sink, new one needs to be placed next to the old one in the slave list the old one's on. If it is a pure event sink, we only need to make sure the new one doesn't end up in the middle of some peer group. "move_mount: allow to add a mount into an existing group" ends up putting the new one in the beginning of list; that's definitely not going to be in the middle of anything, so that's fine for case when old is not marked shared. In case when old one _is_ marked shared (i.e. is not a pure event sink), that breaks the assumptions of propagation graph iterators. Put the new mount next to the old one on the list - that does the right thing in "old is marked shared" case and is just as correct as the current behaviour if old is not marked shared (kudos to Pavel for pointing that out - my original suggested fix changed behaviour in the "nor marked" case, which complicated things for no good reason). Reviewed-by: Christian Brauner Fixes: 9ffb14ef61ba ("move_mount: allow to add a mount into an existing group") Signed-off-by: Al Viro (cherry picked from commit d8cc0362f918d020ca1340d7694f07062dc30f36) Signed-off-by: Roxana Nicolescu --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 971a1b91cac67..8957b1e677f36 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2759,7 +2759,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) if (IS_MNT_SLAVE(from)) { struct mount *m = from->mnt_master; - list_add(&to->mnt_slave, &m->mnt_slave_list); + list_add(&to->mnt_slave, &from->mnt_slave); to->mnt_master = m; } From 6fc92226e201c864189d492da4fa50e8463477ae Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 27 Oct 2025 15:29:46 +0100 Subject: [PATCH 4/4] use uniform permission checks for all mount propagation changes jira VULN-98610 cve-bf CVE-2025-38498 commit-author Al Viro commit cffd0441872e7f6b1fce5e78fb1c99187a291330 do_change_type() and do_set_group() are operating on different aspects of the same thing - propagation graph. The latter asks for mounts involved to be mounted in namespace(s) the caller has CAP_SYS_ADMIN for. The former is a mess - originally it didn't even check that mount *is* mounted. That got fixed, but the resulting check turns out to be too strict for userland - in effect, we check that mount is in our namespace, having already checked that we have CAP_SYS_ADMIN there. What we really need (in both cases) is * only touch mounts that are mounted. That's a must-have constraint - data corruption happens if it get violated. * don't allow to mess with a namespace unless you already have enough permissions to do so (i.e. CAP_SYS_ADMIN in its userns). That's an equivalent of what do_set_group() does; let's extract that into a helper (may_change_propagation()) and use it in both do_set_group() and do_change_type(). Fixes: 12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts" Acked-by: Andrei Vagin Reviewed-by: Pavel Tikhomirov Tested-by: Pavel Tikhomirov Reviewed-by: Christian Brauner Signed-off-by: Al Viro (cherry picked from commit cffd0441872e7f6b1fce5e78fb1c99187a291330) Signed-off-by: Roxana Nicolescu --- fs/namespace.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8957b1e677f36..a5d42d6c4cc9b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2293,6 +2293,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) return attach_recursive_mnt(mnt, p, mp, false); } +static int may_change_propagation(const struct mount *m) +{ + struct mnt_namespace *ns = m->mnt_ns; + + // it must be mounted in some namespace + if (IS_ERR_OR_NULL(ns)) // is_mounted() + return -EINVAL; + // and the caller must be admin in userns of that namespace + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + /* * Sanity check the flags to change_mnt_propagation. */ @@ -2329,10 +2342,10 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); - if (!check_mnt(mnt)) { - err = -EINVAL; + err = may_change_propagation(mnt); + if (err) goto out_unlock; - } + if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -2715,18 +2728,11 @@ static int do_set_group(struct path *from_path, struct path *to_path) namespace_lock(); - err = -EINVAL; - /* To and From must be mounted */ - if (!is_mounted(&from->mnt)) - goto out; - if (!is_mounted(&to->mnt)) - goto out; - - err = -EPERM; - /* We should be allowed to modify mount namespaces of both mounts */ - if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + err = may_change_propagation(from); + if (err) goto out; - if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + err = may_change_propagation(to); + if (err) goto out; err = -EINVAL;