Commit 64cf7d0

tracing: Have trace_marker use per-cpu data to read user space
It was reported that using __copy_from_user_inatomic() can actually
schedule, which is bad when preemption is disabled. There is logic to
check whether in_atomic() is set, but that check is a nop when the
kernel is configured with PREEMPT_NONE. Due to page faulting, the code
could schedule with preemption disabled.

Link: https://lore.kernel.org/all/20250819105152.2766363-1-luogengkun@huaweicloud.com/

The solution was to change __copy_from_user_inatomic() to
copy_from_user_nofault(). But then it was reported that this caused a
regression in Android. There are several applications in Android
writing into trace_marker(), but now, instead of showing the expected
data, it is showing:

	tracing_mark_write: <faulted>

After reverting the conversion to copy_from_user_nofault(), Android was
able to get the data again.

Writing to the trace_marker file is an efficient and quick way to enter
data into the Linux tracing buffer. It takes no locks and was designed
to be as non-intrusive as possible. This means it cannot allocate
memory, and must use pre-allocated data.

A method that is actively being worked on to have faultable system call
tracepoints read user space data is to allocate per CPU buffers, and
use them in the callback. The method uses a technique similar to
seqcount. That is something like this:

	preempt_disable();
	cpu = smp_processor_id();
	buffer = this_cpu_ptr(&pre_allocated_cpu_buffers, cpu);
	do {
		cnt = nr_context_switches_cpu(cpu);
		migrate_disable();
		preempt_enable();
		ret = copy_from_user(buffer, ptr, size);
		preempt_disable();
		migrate_enable();
	} while (!ret && cnt != nr_context_switches_cpu(cpu));
	if (!ret)
		ring_buffer_write(buffer);
	preempt_enable();

It's a little more involved than that, but the above is the basic
logic. The idea is to acquire the current CPU buffer, disable
migration, and then enable preemption. At this moment, it can safely
use copy_from_user(). After reading the data from user space, it
disables preemption again. It then checks to see if there was any new
scheduling on this CPU. If there was, it must assume that the buffer
was corrupted by another task. If there wasn't, then the buffer is
still valid, as only tasks in preemptible context, and only those
running on this CPU, can write to this buffer.

By using this method, where trace_marker open allocates the per CPU
buffers, trace_marker writes can access user space and even fault it
in, without having to allocate or take any locks of its own.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Luo Gengkun <luogengkun@huaweicloud.com>
Cc: Wattson CI <wattson-external@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20251008124510.6dba541a@gandalf.local.home
Fixes: 3d62ab3 ("tracing: Fix tracing_marker may trigger page fault during preempt_disable")
Reported-by: Runping Lai <runpinglai@google.com>
Tested-by: Runping Lai <runpinglai@google.com>
Closes: https://lore.kernel.org/linux-trace-kernel/20251007003417.3470979-2-runpinglai@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
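For background on the user-space side of this path: the Android writers
that hit the regression simply write() strings into the trace_marker
file. Below is a minimal sketch of that usage, not taken from this
commit; the tracefs mount point is an assumption (it is commonly
/sys/kernel/tracing, but may be /sys/kernel/debug/tracing on older
setups), and it typically requires root or suitable tracefs permissions.

	/* Hypothetical user-space writer into trace_marker (assumed path) */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/kernel/tracing/trace_marker";
		char msg[128];
		int fd;

		fd = open(path, O_WRONLY);
		if (fd < 0) {
			perror("open trace_marker");
			return 1;
		}

		/* Each write() becomes one tracing_mark_write: event in the trace */
		snprintf(msg, sizeof(msg), "app checkpoint: pid=%d\n", getpid());
		if (write(fd, msg, strlen(msg)) < 0)
			perror("write trace_marker");

		close(fd);
		return 0;
	}

With the regression, such writes showed up as "tracing_mark_write:
<faulted>" instead of the written string; this commit restores the data
while keeping the write path fault-safe.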
1 parent de4cbd7 commit 64cf7d0


kernel/trace/trace.c

Lines changed: 220 additions & 48 deletions
@@ -4791,12 +4791,6 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
 	return single_release(inode, filp);
 }
 
-static int tracing_mark_open(struct inode *inode, struct file *filp)
-{
-	stream_open(inode, filp);
-	return tracing_open_generic_tr(inode, filp);
-}
-
 static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct trace_array *tr = inode->i_private;
@@ -7163,7 +7157,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
 
 #define TRACE_MARKER_MAX_SIZE 4096
 
-static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf,
 				      size_t cnt, unsigned long ip)
 {
 	struct ring_buffer_event *event;
@@ -7173,20 +7167,11 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
 	int meta_size;
 	ssize_t written;
 	size_t size;
-	int len;
-
-/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_STR "<faulted>"
-#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
 
 	meta_size = sizeof(*entry) + 2;	/* add '\0' and possible '\n' */
  again:
 	size = cnt + meta_size;
 
-	/* If less than "<faulted>", then make sure we can still add that */
-	if (cnt < FAULTED_SIZE)
-		size += FAULTED_SIZE - cnt;
-
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
@@ -7196,9 +7181,6 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
 		 * make it smaller and try again.
 		 */
 		if (size > ring_buffer_max_event_size(buffer)) {
-			/* cnt < FAULTED size should never be bigger than max */
-			if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
-				return -EBADF;
 			cnt = ring_buffer_max_event_size(buffer) - meta_size;
 			/* The above should only happen once */
 			if (WARN_ON_ONCE(cnt + meta_size == size))
@@ -7212,14 +7194,8 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = ip;
-
-	len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
-	if (len) {
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		cnt = FAULTED_SIZE;
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->buf, buf, cnt);
+	written = cnt;
 
 	if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
 		/* do not add \n before testing triggers, but add \0 */
@@ -7243,13 +7219,178 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
 	return written;
 }
 
+struct trace_user_buf {
+	char	*buf;
+};
+
+struct trace_user_buf_info {
+	struct trace_user_buf __percpu	*tbuf;
+	int				ref;
+};
+
+
+static DEFINE_MUTEX(trace_user_buffer_mutex);
+static struct trace_user_buf_info *trace_user_buffer;
+
+static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo)
+{
+	char *buf;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+		kfree(buf);
+	}
+	free_percpu(tinfo->tbuf);
+	kfree(tinfo);
+}
+
+static int trace_user_fault_buffer_enable(void)
+{
+	struct trace_user_buf_info *tinfo;
+	char *buf;
+	int cpu;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	if (trace_user_buffer) {
+		trace_user_buffer->ref++;
+		return 0;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->tbuf = alloc_percpu(struct trace_user_buf);
+	if (!tinfo->tbuf) {
+		kfree(tinfo);
+		return -ENOMEM;
+	}
+
+	tinfo->ref = 1;
+
+	/* Clear each buffer in case of error */
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL;
+	}
+
+	for_each_possible_cpu(cpu) {
+		buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
+				   cpu_to_node(cpu));
+		if (!buf) {
+			trace_user_fault_buffer_free(tinfo);
+			return -ENOMEM;
+		}
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
+	}
+
+	trace_user_buffer = tinfo;
+
+	return 0;
+}
+
+static void trace_user_fault_buffer_disable(void)
+{
+	struct trace_user_buf_info *tinfo;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	tinfo = trace_user_buffer;
+
+	if (WARN_ON_ONCE(!tinfo))
+		return;
+
+	if (--tinfo->ref)
+		return;
+
+	trace_user_fault_buffer_free(tinfo);
+	trace_user_buffer = NULL;
+}
+
+/* Must be called with preemption disabled */
+static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+				   const char __user *ptr, size_t size,
+				   size_t *read_size)
+{
+	int cpu = smp_processor_id();
+	char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+	unsigned int cnt;
+	int trys = 0;
+	int ret;
+
+	if (size > TRACE_MARKER_MAX_SIZE)
+		size = TRACE_MARKER_MAX_SIZE;
+	*read_size = 0;
+
+	/*
+	 * This acts similar to a seqcount. The per CPU context switches are
+	 * recorded, migration is disabled and preemption is enabled. The
+	 * read of the user space memory is copied into the per CPU buffer.
+	 * Preemption is disabled again, and if the per CPU context switches count
+	 * is still the same, it means the buffer has not been corrupted.
+	 * If the count is different, it is assumed the buffer is corrupted
+	 * and reading must be tried again.
+	 */
+
+	do {
+		/*
+		 * If for some reason, copy_from_user() always causes a context
+		 * switch, this would then cause an infinite loop.
+		 * If this task is preempted by another user space task, it
+		 * will cause this task to try again. But just in case something
+		 * changes where the copying from user space causes another task
+		 * to run, prevent this from going into an infinite loop.
+		 * 100 tries should be plenty.
+		 */
+		if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space"))
+			return NULL;
+
+		/* Read the current CPU context switch counter */
+		cnt = nr_context_switches_cpu(cpu);
+
+		/*
+		 * Preemption is going to be enabled, but this task must
+		 * remain on this CPU.
+		 */
+		migrate_disable();
+
+		/*
+		 * Now preemption is being enabed and another task can come in
+		 * and use the same buffer and corrupt our data.
+		 */
+		preempt_enable_notrace();
+
+		ret = __copy_from_user(buffer, ptr, size);
+
+		preempt_disable_notrace();
+		migrate_enable();
+
+		/* if it faulted, no need to test if the buffer was corrupted */
+		if (ret)
+			return NULL;
+
+		/*
+		 * Preemption is disabled again, now check the per CPU context
+		 * switch counter. If it doesn't match, then another user space
+		 * process may have schedule in and corrupted our buffer. In that
+		 * case the copying must be retried.
+		 */
+	} while (nr_context_switches_cpu(cpu) != cnt);
+
+	*read_size = size;
+	return buffer;
+}
+
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *fpos)
 {
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
 	unsigned long ip;
+	size_t size;
+	char *buf;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -7263,39 +7404,44 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt > TRACE_MARKER_MAX_SIZE)
 		cnt = TRACE_MARKER_MAX_SIZE;
 
+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	if (cnt > size)
+		cnt = size;
+
 	/* The selftests expect this function to be the IP address */
 	ip = _THIS_IP_;
 
 	/* The global trace_marker can go to multiple instances */
 	if (tr == &global_trace) {
 		guard(rcu)();
 		list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
-			written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+			written = write_marker_to_buffer(tr, buf, cnt, ip);
 			if (written < 0)
 				break;
 		}
 	} else {
-		written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+		written = write_marker_to_buffer(tr, buf, cnt, ip);
 	}
 
 	return written;
 }
 
 static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
-					  const char __user *ubuf, size_t cnt)
+					  const char *buf, size_t cnt)
 {
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct raw_data_entry *entry;
 	ssize_t written;
-	int size;
-	int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;
 
 	size = sizeof(*entry) + cnt;
-	if (cnt < FAULT_SIZE_ID)
-		size += FAULT_SIZE_ID - cnt;
 
 	buffer = tr->array_buffer.buffer;
 
@@ -7309,14 +7455,8 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
 		return -EBADF;
 
 	entry = ring_buffer_event_data(event);
-
-	len = copy_from_user_nofault(&entry->id, ubuf, cnt);
-	if (len) {
-		entry->id = -1;
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->id, buf, cnt);
+	written = cnt;
 
 	__buffer_unlock_commit(buffer, event);
 
@@ -7329,8 +7469,8 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;
+	char *buf;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -7342,6 +7482,17 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 	if (cnt < sizeof(unsigned int))
 		return -EINVAL;
 
+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	/* raw write is all or nothing */
+	if (cnt > size)
+		return -EINVAL;
+
 	/* The global trace_marker_raw can go to multiple instances */
 	if (tr == &global_trace) {
 		guard(rcu)();
@@ -7357,6 +7508,27 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 	return written;
 }
 
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = trace_user_fault_buffer_enable();
+	if (ret < 0)
+		return ret;
+
+	stream_open(inode, filp);
+	ret = tracing_open_generic_tr(inode, filp);
+	if (ret < 0)
+		trace_user_fault_buffer_disable();
+	return ret;
+}
+
+static int tracing_mark_release(struct inode *inode, struct file *file)
+{
+	trace_user_fault_buffer_disable();
+	return tracing_release_generic_tr(inode, file);
+}
+
 static int tracing_clock_show(struct seq_file *m, void *v)
 {
 	struct trace_array *tr = m->private;
@@ -7764,13 +7936,13 @@ static const struct file_operations tracing_free_buffer_fops = {
 static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };
 
 static const struct file_operations tracing_mark_raw_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_raw_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };
 
 static const struct file_operations trace_clock_fops = {
