
Commit fa1827f

Frederic Weisbecker authored and gregkh committed
perf: Fix hang while freeing sigtrap event
[ Upstream commit 56799bc ]

Perf can hang while freeing a sigtrap event if a related deferred
signal hadn't managed to be sent before the file got closed:

perf_event_overflow()
   task_work_add(perf_pending_task)

fput()
   task_work_add(____fput())

task_work_run()
    ____fput()
        perf_release()
            perf_event_release_kernel()
                _free_event()
                    perf_pending_task_sync()
                        task_work_cancel() -> FAILED
                        rcuwait_wait_event()

Once task_work_run() is running, the list of pending callbacks is
removed from the task_struct and from this point on task_work_cancel()
can't remove any pending and not yet started work items, hence the
task_work_cancel() failure and the hang on rcuwait_wait_event().

Task work could be changed to remove one work at a time, so a work
running on the current task can always cancel a pending one, however
the wait / wake design is still subject to inverted dependencies when
remote targets are involved, as pictured by Oleg:

T1                                            T2

fd = perf_event_open(pid => T2->pid);         fd = perf_event_open(pid => T1->pid);
close(fd)                                     close(fd)
   <IRQ>                                         <IRQ>
   perf_event_overflow()                         perf_event_overflow()
      task_work_add(perf_pending_task)              task_work_add(perf_pending_task)
   </IRQ>                                        </IRQ>
   fput()                                        fput()
      task_work_add(____fput())                     task_work_add(____fput())

   task_work_run()                               task_work_run()
      ____fput()                                    ____fput()
         perf_release()                                perf_release()
            perf_event_release_kernel()                   perf_event_release_kernel()
               _free_event()                                 _free_event()
                  perf_pending_task_sync()                      perf_pending_task_sync()
                     rcuwait_wait_event()                          rcuwait_wait_event()

Therefore the only option left is to acquire the event reference count
upon queueing the perf task work and release it from the task work,
just like it was done before 3a54654 ("perf: Fix event leak upon exec
and file release") but without the leaks it fixed.

Some adjustments are necessary to make it work:

* A child event might dereference its parent upon freeing. Care must be
  taken to release the parent last.

* Some places assuming the event doesn't have any reference held and
  therefore can be freed right away must instead put the reference and
  let the reference counting do its job.

Reported-by: "Yi Lai" <yi1.lai@linux.intel.com>
Closes: https://lore.kernel.org/all/Zx9Losv4YcJowaP%2F@ly-workstation/
Reported-by: syzbot+3c4321e10eea460eb606@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/673adf75.050a0220.87769.0024.GAE@google.com/
Fixes: 3a54654 ("perf: Fix event leak upon exec and file release")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250304135446.18905-1-frederic@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
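To make the resulting lifetime rule easier to follow, here is a minimal single-threaded C sketch of the pattern the patch adopts (an illustration only, not kernel code: struct event, queue_pending_task() and run_pending_task() are made-up stand-ins for the perf event, the overflow path and the task work, and C11 atomics stand in for the kernel's refcount). The overflow path pins the event with an extra reference when it queues the deferred work, the deferred work drops that reference when it runs, and put_event() frees a child before putting its parent:

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for struct perf_event: a refcount, an optional parent, and
 * a flag marking that deferred (sigtrap-like) work has been queued. */
struct event {
	atomic_long refcount;
	struct event *parent;
	int pending_work;
};

/* Drop one reference; on the last put, free the child first and only then
 * put the parent, since a child may still touch its parent while being freed. */
static void put_event(struct event *e)
{
	struct event *parent;

	if (atomic_fetch_sub(&e->refcount, 1) != 1)
		return;

	parent = e->parent;
	printf("freeing event %p\n", (void *)e);
	free(e);
	if (parent)
		put_event(parent);	/* drop the ref the child held on its parent */
}

/* Overflow path: queue the deferred work and pin the event with an extra
 * reference so a concurrent close() never has to wait for (or free) it. */
static void queue_pending_task(struct event *e)
{
	long old = atomic_fetch_add(&e->refcount, 1);

	assert(old > 0);	/* the file still holds a reference at this point */
	e->pending_work = 1;
}

/* Deferred work: deliver the signal, then release the queue-time reference;
 * this may be the last put if the file was closed in the meantime. */
static void run_pending_task(struct event *e)
{
	if (e->pending_work) {
		e->pending_work = 0;
		/* perf_sigtrap(e) would run here */
	}
	put_event(e);
}

int main(void)
{
	struct event *ev = calloc(1, sizeof(*ev));

	atomic_init(&ev->refcount, 1);	/* reference held by the open file */

	queue_pending_task(ev);		/* "IRQ": refcount 1 -> 2 */
	put_event(ev);			/* close(fd): 2 -> 1, no free, no hang */
	run_pending_task(ev);		/* deferred work drops the last reference */
	return 0;
}

With this ordering, close() only drops the file's own reference, so nothing has to wait for the deferred work; the event is actually freed from the task work itself, which is what the diff below implements.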
1 parent 7ef5aa0 commit fa1827f

2 files changed, 18 insertions(+), 47 deletions(-)


include/linux/perf_event.h

Lines changed: 0 additions & 1 deletion
@@ -833,7 +833,6 @@ struct perf_event {
 	struct irq_work			pending_disable_irq;
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
-	struct rcuwait			pending_work_wait;
 
 	atomic_t			event_limit;

kernel/events/core.c

Lines changed: 18 additions & 46 deletions
@@ -5312,30 +5312,6 @@ static bool exclusive_event_installable(struct perf_event *event,
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
 
-static void perf_pending_task_sync(struct perf_event *event)
-{
-	struct callback_head *head = &event->pending_task;
-
-	if (!event->pending_work)
-		return;
-	/*
-	 * If the task is queued to the current task's queue, we
-	 * obviously can't wait for it to complete. Simply cancel it.
-	 */
-	if (task_work_cancel(current, head)) {
-		event->pending_work = 0;
-		local_dec(&event->ctx->nr_no_switch_fast);
-		return;
-	}
-
-	/*
-	 * All accesses related to the event are within the same RCU section in
-	 * perf_pending_task(). The RCU grace period before the event is freed
-	 * will make sure all those accesses are complete by then.
-	 */
-	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
-}
-
 /* vs perf_event_alloc() error */
 static void __free_event(struct perf_event *event)
 {
@@ -5388,7 +5364,6 @@ static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
-	perf_pending_task_sync(event);
 
 	unaccount_event(event);
@@ -5481,10 +5456,17 @@ static void perf_remove_from_owner(struct perf_event *event)
 
 static void put_event(struct perf_event *event)
 {
+	struct perf_event *parent;
+
 	if (!atomic_long_dec_and_test(&event->refcount))
 		return;
 
+	parent = event->parent;
 	_free_event(event);
+
+	/* Matches the refcount bump in inherit_event() */
+	if (parent)
+		put_event(parent);
 }
 
 /*
@@ -5568,11 +5550,6 @@ int perf_event_release_kernel(struct perf_event *event)
 		if (tmp == child) {
 			perf_remove_from_context(child, DETACH_GROUP);
 			list_move(&child->child_list, &free_list);
-			/*
-			 * This matches the refcount bump in inherit_event();
-			 * this can't be the last reference.
-			 */
-			put_event(event);
 		} else {
 			var = &ctx->refcount;
 		}
@@ -5598,7 +5575,8 @@ int perf_event_release_kernel(struct perf_event *event)
 		void *var = &child->ctx->refcount;
 
 		list_del(&child->child_list);
-		free_event(child);
+		/* Last reference unless ->pending_task work is pending */
+		put_event(child);
 
 		/*
 		 * Wake any perf_event_free_task() waiting for this event to be
@@ -5609,7 +5587,11 @@ int perf_event_release_kernel(struct perf_event *event)
 	}
 
 no_ctx:
-	put_event(event); /* Must be the 'last' reference */
+	/*
+	 * Last reference unless ->pending_task work is pending on this event
+	 * or any of its children.
+	 */
+	put_event(event);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -6994,12 +6976,6 @@ static void perf_pending_task(struct callback_head *head)
 	struct perf_event *event = container_of(head, struct perf_event, pending_task);
 	int rctx;
 
-	/*
-	 * All accesses to the event must belong to the same implicit RCU read-side
-	 * critical section as the ->pending_work reset. See comment in
-	 * perf_pending_task_sync().
-	 */
-	rcu_read_lock();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
@@ -7010,9 +6986,8 @@ static void perf_pending_task(struct callback_head *head)
 		event->pending_work = 0;
 		perf_sigtrap(event);
 		local_dec(&event->ctx->nr_no_switch_fast);
-		rcuwait_wake_up(&event->pending_work_wait);
 	}
-	rcu_read_unlock();
+	put_event(event);
 
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
@@ -9935,6 +9910,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		    !task_work_add(current, &event->pending_task, notify_mode)) {
 			event->pending_work = pending_id;
 			local_inc(&event->ctx->nr_no_switch_fast);
+			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
 
 			event->pending_addr = 0;
 			if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -12283,7 +12259,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_irq_work(&event->pending_irq, perf_pending_irq);
 	event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
 	init_task_work(&event->pending_task, perf_pending_task);
-	rcuwait_init(&event->pending_work_wait);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -13426,8 +13401,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 		 * Kick perf_poll() for is_event_hup();
 		 */
 		perf_event_wakeup(parent_event);
-		free_event(event);
-		put_event(parent_event);
+		put_event(event);
 		return;
 	}
@@ -13545,13 +13519,11 @@ static void perf_free_event(struct perf_event *event,
 	list_del_init(&event->child_list);
 	mutex_unlock(&parent->child_mutex);
 
-	put_event(parent);
-
 	raw_spin_lock_irq(&ctx->lock);
 	perf_group_detach(event);
 	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
-	free_event(event);
+	put_event(event);
 }
 
 /*
