
Commit 5fb750e

Alexei Starovoitov authored and borkmann committed
bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
The following kmemleak splat:

[    8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
[    8.106521] Call Trace:
[    8.106521]  <TASK>
[    8.106521]  dump_stack_lvl+0x4b/0x70
[    8.106521]  kvfree_call_rcu+0xcb/0x3b0
[    8.106521]  ? hrtimer_cancel+0x21/0x40
[    8.106521]  bpf_obj_free_fields+0x193/0x200
[    8.106521]  htab_map_update_elem+0x29c/0x410
[    8.106521]  bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
[    8.106521]  bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
[    8.106521]  bpf_prog_test_run_syscall+0xe1/0x2a0

happens due to a combination of features and fixes, but mainly due to
commit 6d78b44 ("bpf: Tell memcg to use allow_spinning=false path in
bpf_timer_init()"). That commit uses __GFP_HIGH, which instructs
slub/kmemleak internals to skip kmemleak_alloc_recursive() on allocation,
so the subsequent kfree_rcu() -> kvfree_call_rcu() -> kmemleak_ignore()
complains with the above splat.

To fix this imbalance, replace bpf_map_kmalloc_node() with
kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock(), to make
sure that objects allocated with kmalloc_nolock() are freed with
kfree_nolock() rather than the implicit kfree() that kfree_rcu() uses
internally.

Note that the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so
it will always fail on PREEMPT_RT. This is not an issue at the moment,
since bpf_timers are disabled on PREEMPT_RT. In the future bpf_spin_lock
will be replaced with a state machine similar to bpf_task_work.

Fixes: 6d78b44 ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/bpf/20251015000700.28988-1-alexei.starovoitov@gmail.com
1 parent e603a34 commit 5fb750e
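
In short, the invariant this patch establishes is that memory obtained
from kmalloc_nolock() must be released with kfree_nolock(); since
kfree_rcu() always ends in a plain kfree(), deferring the matching free
past the RCU grace period requires an explicit call_rcu() callback. Below
is a minimal sketch of that pairing, using a hypothetical demo_obj struct
in place of the kernel's bpf_async_cb; the actual hunks follow.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {			/* hypothetical stand-in for bpf_async_cb */
	int payload;
	struct rcu_head rcu;		/* embedded RCU callback head */
};

static void demo_obj_rcu_free(struct rcu_head *rcu)
{
	/* Recover the enclosing object from its embedded rcu_head. */
	struct demo_obj *obj = container_of(rcu, struct demo_obj, rcu);

	kfree_nolock(obj);		/* must pair with kmalloc_nolock() */
}

static struct demo_obj *demo_obj_alloc(int node)
{
	/* Non-sleeping allocation, usable where spinlocks are held. */
	return kmalloc_nolock(sizeof(struct demo_obj), 0, node);
}

static void demo_obj_release(struct demo_obj *obj)
{
	/* kfree_rcu(obj, rcu) would free with plain kfree() and
	 * reintroduce the kmemleak imbalance; defer the matching
	 * kfree_nolock() through an explicit RCU callback instead.
	 */
	call_rcu(&obj->rcu, demo_obj_rcu_free);
}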

3 files changed, 33 insertions(+), 11 deletions(-)

include/linux/bpf.h

Lines changed: 4 additions & 0 deletions

@@ -2499,6 +2499,8 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 #ifdef CONFIG_MEMCG
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node);
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+			     int node);
 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
 		       gfp_t flags);
@@ -2511,6 +2513,8 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
  */
 #define bpf_map_kmalloc_node(_map, _size, _flags, _node)	\
 		kmalloc_node(_size, _flags, _node)
+#define bpf_map_kmalloc_nolock(_map, _size, _flags, _node)	\
+		kmalloc_nolock(_size, _flags, _node)
 #define bpf_map_kzalloc(_map, _size, _flags)	\
 		kzalloc(_size, _flags)
 #define bpf_map_kvcalloc(_map, _n, _size, _flags)	\

kernel/bpf/helpers.c

Lines changed: 14 additions & 11 deletions

@@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work)
 	rcu_read_unlock_trace();
 }
 
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+	kfree_nolock(cb);
+}
+
 static void bpf_wq_delete_work(struct work_struct *work)
 {
 	struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
 
 	/* Cancel the timer and wait for callback to complete if it was running.
 	 * If hrtimer_cancel() can be safely called it's safe to call
-	 * kfree_rcu(t) right after for both preallocated and non-preallocated
+	 * call_rcu() right after for both preallocated and non-preallocated
 	 * maps. The async->cb = NULL was already done and no code path can see
 	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 		goto out;
 	}
 
-	/* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
-	 * kmalloc_nolock() is available, avoid locking issues by using
-	 * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
-	 */
-	cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
+	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
 	if (!cb) {
 		ret = -ENOMEM;
 		goto out;
@@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 		 * or pinned in bpffs.
 		 */
 		WRITE_ONCE(async->cb, NULL);
-		kfree(cb);
+		kfree_nolock(cb);
 		ret = -EPERM;
 	}
 out:
@@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val)
 	 * timer _before_ calling us, such that failing to cancel it here will
 	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
 	 * Therefore, we _need_ to cancel any outstanding timers before we do
-	 * kfree_rcu, even though no more timers can be armed.
+	 * call_rcu, even though no more timers can be armed.
 	 *
 	 * Moreover, we need to schedule work even if timer does not belong to
	 * the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val)
 		 * completion.
 		 */
 		if (hrtimer_try_to_cancel(&t->timer) >= 0)
-			kfree_rcu(t, cb.rcu);
+			call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
 		else
 			queue_work(system_dfl_wq, &t->cb.delete_work);
 	} else {
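
Taken together, the helpers.c hunks reduce to the following call-site
pattern over the lifetime of a bpf_async_cb (a condensed sketch stitched
from the hunks above, with the surrounding timer/workqueue logic elided):

	/* init: memcg-accounted, non-sleeping allocation (see syscall.c below) */
	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
	if (!cb) {
		ret = -ENOMEM;
		goto out;
	}

	/* early-error unwind: no RCU readers can hold 'cb' yet, free directly */
	kfree_nolock(cb);

	/* normal teardown: wait out RCU readers, then free with the matching
	 * variant via the bpf_async_cb_rcu_free() callback
	 */
	call_rcu(&cb->rcu, bpf_async_cb_rcu_free);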

kernel/bpf/syscall.c

Lines changed: 15 additions & 0 deletions

@@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 	return ptr;
 }
 
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+			     int node)
+{
+	struct mem_cgroup *memcg, *old_memcg;
+	void *ptr;
+
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
+	ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+
+	return ptr;
+}
+
 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
 {
 	struct mem_cgroup *memcg, *old_memcg;