From cf0f3e37e614723425c0fdfa8470b6c9b16729e5 Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu <rnicolescu@ciq.com>
Date: Tue, 21 Oct 2025 14:29:39 +0200
Subject: [PATCH 1/4] do_change_type(): refuse to operate on unmounted/not ours
 mounts

jira VULN-98610
cve CVE-2025-38498
commit-author Al Viro <viro@zeniv.linux.org.uk>
commit 12f147ddd6de7382dad54812e65f3f08d05809fc

Ensure that propagation settings can only be changed for mounts located
in the caller's mount namespace. This change aligns permission checking
with the rest of mount(2).

	Reviewed-by: Christian Brauner <brauner@kernel.org>
Fixes: 07b20889e305 ("beginning of the shared-subtree proper")
	Reported-by: "Orlando, Noah" <Noah.Orlando@deshaw.com>
	Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from commit 12f147ddd6de7382dad54812e65f3f08d05809fc)
	Signed-off-by: Roxana Nicolescu <rnicolescu@ciq.com>
---
 fs/namespace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index 24f32c1eb6013..8c3600a91eff5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2329,6 +2329,10 @@ static int do_change_type(struct path *path, int ms_flags)
 		return -EINVAL;
 
 	namespace_lock();
+	if (!check_mnt(mnt)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)

From 71ac24bf28111fce77f3bfed7de221fb77426108 Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu <rnicolescu@ciq.com>
Date: Mon, 27 Oct 2025 15:27:31 +0100
Subject: [PATCH 2/4] move_mount: allow to add a mount into an existing group

jira VULN-98610
cve-bf CVE-2025-38498
commit-author Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
commit 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2

Previously a sharing group (shared and master ids pair) can be only
inherited when mount is created via bindmount. This patch adds an
ability to add an existing private mount into an existing sharing group.

With this functionality one can first create the desired mount tree from
only private mounts (without the need to care about undesired mount
propagation or mount creation order implied by sharing group
dependencies), and next then setup any desired mount sharing between
those mounts in tree as needed.

This allows CRIU to restore any set of mount namespaces, mount trees and
sharing group trees for a container.

We have many issues with restoring mounts in CRIU related to sharing
groups and propagation:
- reverse sharing groups vs mount tree order requires complex mounts
  reordering which mostly implies also using some temporary mounts
(please see https://lkml.org/lkml/2021/3/23/569 for more info)

- mount() syscall creates tons of mounts due to propagation
- mount re-parenting due to propagation
- "Mount Trap" due to propagation
- "Non Uniform" propagation, meaning that with different tricks with
  mount order and temporary children-"lock" mounts one can create mount
  trees which can't be restored without those tricks
(see https://www.linuxplumbersconf.org/event/7/contributions/640/)

With this new functionality we can resolve all the problems with
propagation at once.

Link: https://lore.kernel.org/r/20210715100714.120228-1-ptikhomirov@virtuozzo.com
	Cc: Eric W. Biederman <ebiederm@xmission.com>
	Cc: Alexander Viro <viro@zeniv.linux.org.uk>
	Cc: Christian Brauner <christian.brauner@ubuntu.com>
	Cc: Mattias Nissler <mnissler@chromium.org>
	Cc: Aleksa Sarai <cyphar@cyphar.com>
	Cc: Andrei Vagin <avagin@gmail.com>
	Cc: linux-fsdevel@vger.kernel.org
	Cc: linux-api@vger.kernel.org
	Cc: lkml <linux-kernel@vger.kernel.org>
Co-developed-by: Andrei Vagin <avagin@gmail.com>
	Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
	Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
	Signed-off-by: Andrei Vagin <avagin@gmail.com>
	Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
(cherry picked from commit 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2)
	Signed-off-by: Roxana Nicolescu <rnicolescu@ciq.com>
---
 fs/namespace.c             | 77 +++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/mount.h |  3 +-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 8c3600a91eff5..971a1b91cac67 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2705,6 +2705,78 @@ static bool check_for_nsfs_mounts(struct mount *subtree)
 	return ret;
 }
 
+static int do_set_group(struct path *from_path, struct path *to_path)
+{
+	struct mount *from, *to;
+	int err;
+
+	from = real_mount(from_path->mnt);
+	to = real_mount(to_path->mnt);
+
+	namespace_lock();
+
+	err = -EINVAL;
+	/* To and From must be mounted */
+	if (!is_mounted(&from->mnt))
+		goto out;
+	if (!is_mounted(&to->mnt))
+		goto out;
+
+	err = -EPERM;
+	/* We should be allowed to modify mount namespaces of both mounts */
+	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out;
+	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out;
+
+	err = -EINVAL;
+	/* To and From paths should be mount roots */
+	if (from_path->dentry != from_path->mnt->mnt_root)
+		goto out;
+	if (to_path->dentry != to_path->mnt->mnt_root)
+		goto out;
+
+	/* Setting sharing groups is only allowed across same superblock */
+	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
+		goto out;
+
+	/* From mount root should be wider than To mount root */
+	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
+		goto out;
+
+	/* From mount should not have locked children in place of To's root */
+	if (has_locked_children(from, to->mnt.mnt_root))
+		goto out;
+
+	/* Setting sharing groups is only allowed on private mounts */
+	if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
+		goto out;
+
+	/* From should not be private */
+	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
+		goto out;
+
+	if (IS_MNT_SLAVE(from)) {
+		struct mount *m = from->mnt_master;
+
+		list_add(&to->mnt_slave, &m->mnt_slave_list);
+		to->mnt_master = m;
+	}
+
+	if (IS_MNT_SHARED(from)) {
+		to->mnt_group_id = from->mnt_group_id;
+		list_add(&to->mnt_share, &from->mnt_share);
+		lock_mount_hash();
+		set_mnt_shared(to);
+		unlock_mount_hash();
+	}
+
+	err = 0;
+out:
+	namespace_unlock();
+	return err;
+}
+
 static int do_move_mount(struct path *old_path, struct path *new_path)
 {
 	struct mnt_namespace *ns;
@@ -3689,7 +3761,10 @@ SYSCALL_DEFINE5(move_mount,
 	if (ret < 0)
 		goto out_to;
 
-	ret = do_move_mount(&from_path, &to_path);
+	if (flags & MOVE_MOUNT_SET_GROUP)
+		ret = do_set_group(&from_path, &to_path);
+	else
+		ret = do_move_mount(&from_path, &to_path);
 
 out_to:
 	path_put(&to_path);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index dd7a166fdf9c2..4d93967f8aea0 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -73,7 +73,8 @@
 #define MOVE_MOUNT_T_SYMLINKS		0x00000010 /* Follow symlinks on to path */
 #define MOVE_MOUNT_T_AUTOMOUNTS		0x00000020 /* Follow automounts on to path */
 #define MOVE_MOUNT_T_EMPTY_PATH		0x00000040 /* Empty to path permitted */
-#define MOVE_MOUNT__MASK		0x00000077
+#define MOVE_MOUNT_SET_GROUP		0x00000100 /* Set sharing group instead */
+#define MOVE_MOUNT__MASK		0x00000177
 
 /*
  * fsopen() flags.

From 64d6ebe22bd251efd5a375a884d91136d69e05ca Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu <rnicolescu@ciq.com>
Date: Mon, 27 Oct 2025 15:29:20 +0100
Subject: [PATCH 3/4] fix propagation graph breakage by MOVE_MOUNT_SET_GROUP
 move_mount(2)

jira VULN-98610
cve-bf CVE-2025-38498
commit-author Al Viro <viro@zeniv.linux.org.uk>
commit d8cc0362f918d020ca1340d7694f07062dc30f36

9ffb14ef61ba "move_mount: allow to add a mount into an existing group"
breaks assertions on ->mnt_share/->mnt_slave.  For once, the data structures
in question are actually documented.

Documentation/filesystem/sharedsubtree.rst:
        All vfsmounts in a peer group have the same ->mnt_master.  If it is
	non-NULL, they form a contiguous (ordered) segment of slave list.

do_set_group() puts a mount into the same place in propagation graph
as the old one.  As the result, if old mount gets events from somewhere
and is not a pure event sink, new one needs to be placed next to the
old one in the slave list the old one's on.  If it is a pure event
sink, we only need to make sure the new one doesn't end up in the
middle of some peer group.

"move_mount: allow to add a mount into an existing group" ends up putting
the new one in the beginning of list; that's definitely not going to be
in the middle of anything, so that's fine for case when old is not marked
shared.  In case when old one _is_ marked shared (i.e. is not a pure event
sink), that breaks the assumptions of propagation graph iterators.

Put the new mount next to the old one on the list - that does the right thing
in "old is marked shared" case and is just as correct as the current behaviour
if old is not marked shared (kudos to Pavel for pointing that out - my original
suggested fix changed behaviour in the "nor marked" case, which complicated
things for no good reason).

	Reviewed-by: Christian Brauner <brauner@kernel.org>
Fixes: 9ffb14ef61ba ("move_mount: allow to add a mount into an existing group")
	Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from commit d8cc0362f918d020ca1340d7694f07062dc30f36)
	Signed-off-by: Roxana Nicolescu <rnicolescu@ciq.com>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 971a1b91cac67..8957b1e677f36 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2759,7 +2759,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 	if (IS_MNT_SLAVE(from)) {
 		struct mount *m = from->mnt_master;
 
-		list_add(&to->mnt_slave, &m->mnt_slave_list);
+		list_add(&to->mnt_slave, &from->mnt_slave);
 		to->mnt_master = m;
 	}
 

From 6fc92226e201c864189d492da4fa50e8463477ae Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu <rnicolescu@ciq.com>
Date: Mon, 27 Oct 2025 15:29:46 +0100
Subject: [PATCH 4/4] use uniform permission checks for all mount propagation
 changes

jira VULN-98610
cve-bf CVE-2025-38498
commit-author Al Viro <viro@zeniv.linux.org.uk>
commit cffd0441872e7f6b1fce5e78fb1c99187a291330

do_change_type() and do_set_group() are operating on different
aspects of the same thing - propagation graph.  The latter
asks for mounts involved to be mounted in namespace(s) the caller
has CAP_SYS_ADMIN for.  The former is a mess - originally it
didn't even check that mount *is* mounted.  That got fixed,
but the resulting check turns out to be too strict for userland -
in effect, we check that mount is in our namespace, having already
checked that we have CAP_SYS_ADMIN there.

What we really need (in both cases) is
	* only touch mounts that are mounted.  That's a must-have
constraint - data corruption happens if it get violated.
	* don't allow to mess with a namespace unless you already
have enough permissions to do so (i.e. CAP_SYS_ADMIN in its userns).

That's an equivalent of what do_set_group() does; let's extract that
into a helper (may_change_propagation()) and use it in both
do_set_group() and do_change_type().

Fixes: 12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts"
	Acked-by: Andrei Vagin <avagin@gmail.com>
	Reviewed-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
	Tested-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
	Reviewed-by: Christian Brauner <brauner@kernel.org>
	Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from commit cffd0441872e7f6b1fce5e78fb1c99187a291330)
	Signed-off-by: Roxana Nicolescu <rnicolescu@ciq.com>
---
 fs/namespace.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 8957b1e677f36..a5d42d6c4cc9b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2293,6 +2293,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	return attach_recursive_mnt(mnt, p, mp, false);
 }
 
+static int may_change_propagation(const struct mount *m)
+{
+        struct mnt_namespace *ns = m->mnt_ns;
+
+	 // it must be mounted in some namespace
+	 if (IS_ERR_OR_NULL(ns))         // is_mounted()
+		 return -EINVAL;
+	 // and the caller must be admin in userns of that namespace
+	 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		 return -EPERM;
+	 return 0;
+}
+
 /*
  * Sanity check the flags to change_mnt_propagation.
  */
@@ -2329,10 +2342,10 @@ static int do_change_type(struct path *path, int ms_flags)
 		return -EINVAL;
 
 	namespace_lock();
-	if (!check_mnt(mnt)) {
-		err = -EINVAL;
+	err = may_change_propagation(mnt);
+	if (err)
 		goto out_unlock;
-	}
+
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)
@@ -2715,18 +2728,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 
 	namespace_lock();
 
-	err = -EINVAL;
-	/* To and From must be mounted */
-	if (!is_mounted(&from->mnt))
-		goto out;
-	if (!is_mounted(&to->mnt))
-		goto out;
-
-	err = -EPERM;
-	/* We should be allowed to modify mount namespaces of both mounts */
-	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(from);
+	if (err)
 		goto out;
-	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	err = may_change_propagation(to);
+	if (err)
 		goto out;
 
 	err = -EINVAL;