
Commit fd57572

Merge tag 'sched_ext-for-6.18-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - Fix scx_kick_pseqs corruption when multiple schedulers are loaded
   concurrently

 - Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() to handle
   systems with large CPU counts

 - Defer queue_balance_callback() until after ops.dispatch to fix
   callback ordering issues

 - Sync error_irq_work before freeing scx_sched to prevent use-after-free

 - Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU for proper RCU
   protection

 - Fix flag check for deferred callbacks

* tag 'sched_ext-for-6.18-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: fix flag check for deferred callbacks
  sched_ext: Fix scx_kick_pseqs corruption on concurrent scheduler loads
  sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
  sched_ext: defer queue_balance_callback() until after ops.dispatch
  sched_ext: Sync error_irq_work before freeing scx_sched
  sched_ext: Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU
2 parents dcb6fa3 + a3c4a0a commit fd57572

2 files changed: +112, -15 lines changed

kernel/sched/ext.c

Lines changed: 111 additions & 15 deletions
@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 
 static struct delayed_work scx_watchdog_work;
 
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+        struct rcu_head rcu;
+        unsigned long seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
 
 /*
  * Direct dispatch marker.
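For orientation, here is a minimal sketch of the read side of this new arrangement (hypothetical demo_* names, not the kernel's code): a per-CPU __rcu pointer to a separately allocated flexible-array struct has to be dereferenced under RCU and may legitimately be NULL, since the arrays now only exist while a scheduler is enabled.

#include <linux/percpu.h>
#include <linux/rcupdate.h>

/* Hypothetical stand-in for struct scx_kick_pseqs. */
struct demo_pseqs {
        struct rcu_head rcu;
        unsigned long seqs[];           /* one slot per possible CPU */
};

static DEFINE_PER_CPU(struct demo_pseqs __rcu *, demo_pseqs_ptr);

/* Reader: the pointer is NULL whenever no scheduler is enabled. */
static unsigned long demo_read_seq(int target_cpu)
{
        struct demo_pseqs *p;
        unsigned long seq = 0;

        rcu_read_lock();
        p = rcu_dereference(*this_cpu_ptr(&demo_pseqs_ptr));
        if (p)
                seq = p->seqs[target_cpu];
        rcu_read_unlock();
        return seq;
}
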
@@ -780,13 +791,23 @@ static void schedule_deferred(struct rq *rq)
 	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
 		return;
 
+	/* Don't do anything if there already is a deferred operation. */
+	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+		return;
+
 	/*
 	 * If in balance, the balance callbacks will be called before rq lock is
 	 * released. Schedule one.
+	 *
+	 *
+	 * We can't directly insert the callback into the
+	 * rq's list: The call can drop its lock and make the pending balance
+	 * callback visible to unrelated code paths that call rq_pin_lock().
+	 *
+	 * Just let balance_one() know that it must do it itself.
 	 */
 	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
-		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
-				       deferred_bal_cb_workfn);
+		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
 		return;
 	}
 
@@ -2003,6 +2024,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
 	dspc->cursor = 0;
 }
 
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+		return;
+
+	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+			       deferred_bal_cb_workfn);
+
+	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
 	struct scx_sched *sch = scx_root;
@@ -2150,6 +2184,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 #endif
 	rq_repin_lock(rq, rf);
 
+	maybe_queue_balance_callback(rq);
+
 	return ret;
 }
 
@@ -3471,7 +3507,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int node;
 
+	irq_work_sync(&sch->error_irq_work);
 	kthread_stop(sch->helper->task);
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
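The irq_work_sync() added above addresses the use-after-free called out in the merge summary: the error path queues sch->error_irq_work, and the RCU-deferred free of the scx_sched could otherwise run while that irq_work is still pending or executing. A minimal sketch of the lifecycle rule (illustrative, hypothetical demo_* names; the irq_work calls themselves are the real API):

#include <linux/irq_work.h>
#include <linux/slab.h>

struct demo_sched {
        struct irq_work error_irq_work;
        /* ... */
};

static void demo_error_workfn(struct irq_work *work)
{
        /* Runs from IRQ context some time after irq_work_queue(). */
}

static struct demo_sched *demo_alloc(void)
{
        struct demo_sched *sch = kzalloc(sizeof(*sch), GFP_KERNEL);

        if (sch)
                init_irq_work(&sch->error_irq_work, demo_error_workfn);
        return sch;
}

static void demo_free(struct demo_sched *sch)
{
        /*
         * Wait for any queued or still-running instance to finish before
         * the memory backing the irq_work goes away.
         */
        irq_work_sync(&sch->error_irq_work);
        kfree(sch);
}
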
@@ -3850,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 	}
 }
 
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+	kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *to_free;
+
+		to_free = rcu_replace_pointer(*pseqs, NULL, true);
+		if (to_free)
+			call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+	}
+}
+
 static void scx_disable_workfn(struct kthread_work *work)
 {
 	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
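free_kick_pseqs() detaches each array with rcu_replace_pointer() and defers the kvfree() through call_rcu(), so readers that race with the disable path (such as a still-running kick IRQ work) finish before the memory goes away. For reference, a rough paraphrase of what rcu_replace_pointer(rcu_ptr, ptr, c) does (a sketch of the rcupdate.h helper, not a verbatim copy); passing true as the condition, as above, simply waives the lockdep check on the update-side protection:

#include <linux/rcupdate.h>

/* Rough equivalent of rcu_replace_pointer(rcu_ptr, ptr, c): */
#define demo_rcu_replace_pointer(rcu_ptr, ptr, c)                       \
({                                                                      \
        typeof(ptr) __old = rcu_dereference_protected((rcu_ptr), (c));  \
        rcu_assign_pointer((rcu_ptr), (ptr));                           \
        __old;  /* returned to the caller, here handed to call_rcu() */ \
})
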
@@ -3986,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 	free_percpu(scx_dsp_ctx);
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
+	free_kick_pseqs();
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -4348,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
 	irq_work_queue(&sch->error_irq_work);
 }
 
+static int alloc_kick_pseqs(void)
+{
+	int cpu;
+
+	/*
+	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+	 * can exceed percpu allocator limits on large machines.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *new_pseqs;
+
+		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+		new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+					  GFP_KERNEL, cpu_to_node(cpu));
+		if (!new_pseqs) {
+			free_kick_pseqs();
+			return -ENOMEM;
+		}
+
+		rcu_assign_pointer(*pseqs, new_pseqs);
+	}
+
+	return 0;
+}
+
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 {
 	struct scx_sched *sch;
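alloc_kick_pseqs() leans on two helpers worth spelling out: struct_size(new_pseqs, seqs, nr_cpu_ids) computes the size of the struct plus nr_cpu_ids flexible-array elements with overflow checking, and kvzalloc_node() falls back from kmalloc to vmalloc when that size exceeds what the slab allocator will serve, which is exactly the large-CPU-count case the comment mentions. A minimal sketch with hypothetical names:

#include <linux/overflow.h>     /* struct_size() */
#include <linux/slab.h>         /* kvzalloc_node(), kvfree() */
#include <linux/types.h>        /* struct rcu_head */

struct demo_pseqs {
        struct rcu_head rcu;
        unsigned long seqs[];
};

static struct demo_pseqs *demo_alloc(unsigned int nr, int node)
{
        struct demo_pseqs *p;

        /*
         * struct_size(p, seqs, nr) evaluates to
         * sizeof(*p) + nr * sizeof(p->seqs[0]), saturating at SIZE_MAX
         * instead of wrapping on overflow. kvzalloc_node() tries kmalloc
         * first and falls back to vmalloc for requests too large for slab.
         */
        p = kvzalloc_node(struct_size(p, seqs, nr), GFP_KERNEL, node);
        return p;
}
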
@@ -4495,10 +4582,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
+	ret = alloc_kick_pseqs();
+	if (ret)
+		goto err_unlock;
+
 	sch = scx_alloc_and_add_sched(ops);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
-		goto err_unlock;
+		goto err_free_pseqs;
 	}
 
 	/*
@@ -4701,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	return 0;
 
+err_free_pseqs:
+	free_kick_pseqs();
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
@@ -5082,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
 	struct scx_rq *this_scx = &this_rq->scx;
-	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
 	bool should_wait = false;
+	unsigned long *pseqs;
 	s32 cpu;
 
+	if (unlikely(!pseqs_pcpu)) {
+		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+		return;
+	}
+
+	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
 		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
@@ -5208,11 +5309,6 @@ void __init init_sched_ext_class(void)
 
 	scx_idle_init_masks();
 
-	scx_kick_cpus_pnt_seqs =
-		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
-			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
-	BUG_ON(!scx_kick_cpus_pnt_seqs);
-
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 		int n = cpu_to_node(cpu);
@@ -5688,8 +5784,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
@@ -5820,8 +5916,8 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
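KF_RCU tells the BPF verifier that these kfuncs expect RCU-protected (or otherwise trusted) pointer arguments, which matches how they are meant to be called: from inside a DSQ iterator, staging the slice or vtime override immediately before the move. Below is a rough sketch of such a caller, assuming the sched_ext BPF tooling's common.bpf.h helpers (BPF_STRUCT_OPS, bpf_for_each, BPF_FOR_EACH_ITER); the DSQ id and slice value are made up for illustration:

/* BPF side -- illustrative only. */
#define DEMO_SHARED_DSQ 0       /* hypothetical DSQ id */

void BPF_STRUCT_OPS(demo_dispatch, s32 cpu, struct task_struct *prev)
{
        struct task_struct *p;

        bpf_for_each(scx_dsq, p, DEMO_SHARED_DSQ, 0) {
                /* Stage a new slice for the upcoming move... */
                scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, 5000000 /* 5ms */);
                /* ...then move @p to this CPU's local DSQ. */
                if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0))
                        break;
        }
}
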

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
@@ -784,6 +784,7 @@ enum scx_rq_flags {
 	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
 	SCX_RQ_BYPASSING	= 1 << 4,
 	SCX_RQ_CLK_VALID	= 1 << 5, /* RQ clock is fresh and valid */
+	SCX_RQ_BAL_CB_PENDING	= 1 << 6, /* must queue a cb after dispatching */
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,
