Skip to content

Commit e32ba89

Browse files
author
Herton R. Krzesinski
committed
Merge: fs/exec: switch timens when a task gets a new mm
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/1864 fs/exec: switch timens when a task gets a new mm Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2116442 commit 2b5f9da Author: Andrei Vagin <avagin@gmail.com> Date: Tue Sep 20 17:31:19 2022 -0700 fs/exec: switch timens when a task gets a new mm Changing a time namespace requires remapping a vvar page, so we don't want to allow doing that if any other tasks can use the same mm. Currently, we install a time namespace when a task is created with a new vm. exec() is another case when a task gets a new mm and so it can switch a time namespace safely, but it isn't handled now. One more issue of the current interface is that clone() with CLONE_VM isn't allowed if the current task has unshared a time namespace (timens_for_children doesn't match the current timens). Both these issues make some inconvenience for users. For example, Alexey and Florian reported that posix_spawn() uses vfork+exec and this pattern doesn't work with time namespaces due to the both described issues. LXC needed to workaround the exec() issue by calling setns. In the commit 133e2d3 ("fs/exec: allow to unshare a time namespace on vfork+exec"), we tried to fix these issues with minimal impact on UAPI. But it adds extra complexity and some undesirable side effects. Eric suggested fixing the issues properly because here are all the reasons to suppose that there are no users that depend on the old behavior. Cc: Alexey Izbyshev <izbyshev@ispras.ru> Cc: Christian Brauner <brauner@kernel.org> Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Florian Weimer <fweimer@redhat.com> Cc: Kees Cook <keescook@chromium.org> Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com> Origin-author: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: Andrei Vagin <avagin@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220921003120.209637-1-avagin@google.com Signed-off-by: Oleg Nesterov <oleg@redhat.com> Approved-by: Alexey Gladkov <agladkov@redhat.com> Approved-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
2 parents 390e508 + 29b9b12 commit e32ba89

File tree

7 files changed

+168
-12
lines changed

7 files changed

+168
-12
lines changed

fs/exec.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
#include <linux/vmalloc.h>
6666
#include <linux/io_uring.h>
6767
#include <linux/syscall_user_dispatch.h>
68+
#include <linux/time_namespace.h>
6869

6970
#include <linux/uaccess.h>
7071
#include <asm/mmu_context.h>
@@ -1299,6 +1300,10 @@ int begin_new_exec(struct linux_binprm * bprm)
12991300

13001301
bprm->mm = NULL;
13011302

1303+
retval = exec_task_namespaces();
1304+
if (retval)
1305+
goto out_unlock;
1306+
13021307
#ifdef CONFIG_POSIX_TIMERS
13031308
spin_lock_irq(&me->sighand->siglock);
13041309
posix_cpu_timers_exit(me);

include/linux/nsproxy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ static inline struct cred *nsset_cred(struct nsset *set)
9494
int copy_namespaces(unsigned long flags, struct task_struct *tsk);
9595
void exit_task_namespaces(struct task_struct *tsk);
9696
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
97+
int exec_task_namespaces(void);
9798
void free_nsproxy(struct nsproxy *ns);
9899
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
99100
struct cred *, struct fs_struct *);

kernel/fork.c

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,15 +2042,6 @@ static __latent_entropy struct task_struct *copy_process(
20422042
return ERR_PTR(-EINVAL);
20432043
}
20442044

2045-
/*
2046-
* If the new process will be in a different time namespace
2047-
* do not allow it to share VM or a thread group with the forking task.
2048-
*/
2049-
if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
2050-
if (nsp->time_ns != nsp->time_ns_for_children)
2051-
return ERR_PTR(-EINVAL);
2052-
}
2053-
20542045
if (clone_flags & CLONE_PIDFD) {
20552046
/*
20562047
* - CLONE_DETACHED is blocked so that we can potentially

kernel/nsproxy.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
157157
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
158158
CLONE_NEWPID | CLONE_NEWNET |
159159
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
160-
if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
160+
if ((flags & CLONE_VM) ||
161+
likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
161162
get_nsproxy(old_ns);
162163
return 0;
163164
}
@@ -179,7 +180,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
179180
if (IS_ERR(new_ns))
180181
return PTR_ERR(new_ns);
181182

182-
timens_on_fork(new_ns, tsk);
183+
if ((flags & CLONE_VM) == 0)
184+
timens_on_fork(new_ns, tsk);
183185

184186
tsk->nsproxy = new_ns;
185187
return 0;
@@ -254,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p)
254256
switch_task_namespaces(p, NULL);
255257
}
256258

259+
int exec_task_namespaces(void)
260+
{
261+
struct task_struct *tsk = current;
262+
struct nsproxy *new;
263+
264+
if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
265+
return 0;
266+
267+
new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
268+
if (IS_ERR(new))
269+
return PTR_ERR(new);
270+
271+
timens_on_fork(new, tsk);
272+
switch_task_namespaces(tsk, new);
273+
return 0;
274+
}
275+
257276
static int check_setns_flags(unsigned long flags)
258277
{
259278
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |

tools/testing/selftests/timens/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ procfs
88
timens
99
timer
1010
timerfd
11+
vfork_exec

tools/testing/selftests/timens/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex
1+
TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex vfork_exec
22
TEST_GEN_PROGS_EXTENDED := gettime_perf
33

44
CFLAGS := -Wall -Werror -pthread
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#define _GNU_SOURCE
3+
#include <errno.h>
4+
#include <fcntl.h>
5+
#include <sched.h>
6+
#include <stdio.h>
7+
#include <stdbool.h>
8+
#include <sys/stat.h>
9+
#include <sys/syscall.h>
10+
#include <sys/types.h>
11+
#include <sys/wait.h>
12+
#include <time.h>
13+
#include <unistd.h>
14+
#include <string.h>
15+
#include <pthread.h>
16+
17+
#include "log.h"
18+
#include "timens.h"
19+
20+
#define OFFSET (36000)
21+
22+
struct thread_args {
23+
char *tst_name;
24+
struct timespec *now;
25+
};
26+
27+
static void *tcheck(void *_args)
28+
{
29+
struct thread_args *args = _args;
30+
struct timespec *now = args->now, tst;
31+
int i;
32+
33+
for (i = 0; i < 2; i++) {
34+
_gettime(CLOCK_MONOTONIC, &tst, i);
35+
if (abs(tst.tv_sec - now->tv_sec) > 5) {
36+
pr_fail("%s: in-thread: unexpected value: %ld (%ld)\n",
37+
args->tst_name, tst.tv_sec, now->tv_sec);
38+
return (void *)1UL;
39+
}
40+
}
41+
return NULL;
42+
}
43+
44+
static int check_in_thread(char *tst_name, struct timespec *now)
45+
{
46+
struct thread_args args = {
47+
.tst_name = tst_name,
48+
.now = now,
49+
};
50+
pthread_t th;
51+
void *retval;
52+
53+
if (pthread_create(&th, NULL, tcheck, &args))
54+
return pr_perror("thread");
55+
if (pthread_join(th, &retval))
56+
return pr_perror("pthread_join");
57+
return !(retval == NULL);
58+
}
59+
60+
static int check(char *tst_name, struct timespec *now)
61+
{
62+
struct timespec tst;
63+
int i;
64+
65+
for (i = 0; i < 2; i++) {
66+
_gettime(CLOCK_MONOTONIC, &tst, i);
67+
if (abs(tst.tv_sec - now->tv_sec) > 5)
68+
return pr_fail("%s: unexpected value: %ld (%ld)\n",
69+
tst_name, tst.tv_sec, now->tv_sec);
70+
}
71+
if (check_in_thread(tst_name, now))
72+
return 1;
73+
ksft_test_result_pass("%s\n", tst_name);
74+
return 0;
75+
}
76+
77+
int main(int argc, char *argv[])
78+
{
79+
struct timespec now;
80+
int status;
81+
pid_t pid;
82+
83+
if (argc > 1) {
84+
char *endptr;
85+
86+
ksft_cnt.ksft_pass = 1;
87+
now.tv_sec = strtoul(argv[1], &endptr, 0);
88+
if (*endptr != 0)
89+
return pr_perror("strtoul");
90+
91+
return check("child after exec", &now);
92+
}
93+
94+
nscheck();
95+
96+
ksft_set_plan(4);
97+
98+
clock_gettime(CLOCK_MONOTONIC, &now);
99+
100+
if (unshare_timens())
101+
return 1;
102+
103+
if (_settime(CLOCK_MONOTONIC, OFFSET))
104+
return 1;
105+
106+
if (check("parent before vfork", &now))
107+
return 1;
108+
109+
pid = vfork();
110+
if (pid < 0)
111+
return pr_perror("fork");
112+
113+
if (pid == 0) {
114+
char now_str[64];
115+
char *cargv[] = {"exec", now_str, NULL};
116+
char *cenv[] = {NULL};
117+
118+
/* Check for proper vvar offsets after execve. */
119+
snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET);
120+
execve("/proc/self/exe", cargv, cenv);
121+
pr_perror("execve");
122+
_exit(1);
123+
}
124+
125+
if (waitpid(pid, &status, 0) != pid)
126+
return pr_perror("waitpid");
127+
128+
if (status)
129+
ksft_exit_fail();
130+
ksft_inc_pass_cnt();
131+
ksft_test_result_pass("wait for child\n");
132+
133+
/* Check that we are still in the source timens. */
134+
if (check("parent after vfork", &now))
135+
return 1;
136+
137+
ksft_exit_pass();
138+
return 0;
139+
}

0 commit comments

Comments
 (0)