@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 
 static struct delayed_work scx_watchdog_work;
 
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as the size can exceed
+ * percpu allocator limits on large machines. The O(nr_cpu_ids^2) allocation is
+ * made lazily on enable and freed on disable to avoid waste while sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+	struct rcu_head		rcu;
+	unsigned long		seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
 
 /*
  * Direct dispatch marker.
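To make the sizing comment concrete, here is the back-of-the-envelope behind it (a sketch; the 32 KiB cap comes from PCPU_MIN_UNIT_SIZE, the largest single allocation the percpu allocator will serve in current trees, and may differ by version):

    /*
     * Illustrative arithmetic only, not part of the patch. Each CPU gets
     * one sequence slot per possible CPU:
     *
     *     per-CPU size  = struct_size(pseqs, seqs, nr_cpu_ids)
     *                  ~= nr_cpu_ids * sizeof(unsigned long)
     *
     * With nr_cpu_ids == 8192 that is ~64 KiB per CPU, past the 32 KiB
     * percpu limit, and ~512 MiB across all CPUs, hence kvzalloc() and
     * the lazy enable/disable lifetime.
     */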
@@ -780,13 +791,23 @@ static void schedule_deferred(struct rq *rq)
 	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
 		return;
 
+	/* Don't do anything if there already is a deferred operation. */
+	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+		return;
+
 	/*
 	 * If in balance, the balance callbacks will be called before rq lock is
 	 * released. Schedule one.
+	 *
+	 * We can't directly insert the callback into the rq's list: the balance
+	 * path can drop the rq lock, making the pending balance callback
+	 * visible to unrelated code paths that call rq_pin_lock().
+	 *
+	 * Just let balance_one() know that it must do it itself.
 	 */
 	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
-		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
-				       deferred_bal_cb_workfn);
+		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
 		return;
 	}
 
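For context, the warning this avoids lives in rq_pin_lock(); paraphrased from kernel/sched/sched.h (the guard macros vary across kernel versions):

    static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
    {
        rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
    #ifdef CONFIG_SCHED_DEBUG
        rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
        rf->clock_update_flags = 0;
        /*
         * A balance callback left queued across a lock drop would trip
         * this in whichever unrelated path pins the rq next, which is
         * why schedule_deferred() above only sets a flag while in balance.
         */
        WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
    #endif
    }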
@@ -2003,6 +2024,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
 	dspc->cursor = 0;
 }
 
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+		return;
+
+	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+			       deferred_bal_cb_workfn);
+
+	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
 	struct scx_sched *sch = scx_root;
@@ -2150,6 +2184,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 #endif
 	rq_repin_lock(rq, rf);
 
+	maybe_queue_balance_callback(rq);
+
 	return ret;
 }
 
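Putting the three hunks together, the intended sequence is (schematic, not literal code):

    /*
     * schedule_deferred()          rq lock held, SCX_RQ_IN_BALANCE set
     *   -> sets SCX_RQ_BAL_CB_PENDING            (no callback queued yet)
     * balance_one()                may drop and retake the rq lock;
     *                              rq_pin_lock() callers see no callback
     * balance_scx()
     *   -> rq_repin_lock(rq, rf)
     *   -> maybe_queue_balance_callback(rq)
     *        -> queue_balance_callback(...)      lock is now held until the
     *        -> clears SCX_RQ_BAL_CB_PENDING     callbacks actually run
     */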
@@ -3471,7 +3507,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int node;
 
+	irq_work_sync(&sch->error_irq_work);
 	kthread_stop(sch->helper->task);
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -3850,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 	}
 }
 
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+	kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs __rcu **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *to_free;
+
+		to_free = rcu_replace_pointer(*pseqs, NULL, true);
+		if (to_free)
+			call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+	}
+}
+
 static void scx_disable_workfn(struct kthread_work *work)
 {
 	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
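The rcu_head callback above is the standard unpublish-then-defer-free shape. Assuming the tree's two-argument kvfree_rcu() (which takes the rcu_head member name), the loop body could equivalently read:

    to_free = rcu_replace_pointer(*pseqs, NULL, true);
    if (to_free)
        kvfree_rcu(to_free, rcu);   /* same effect as call_rcu() + kvfree() */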
@@ -3986,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 	free_percpu(scx_dsp_ctx);
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
+	free_kick_pseqs();
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -4348,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
 	irq_work_queue(&sch->error_irq_work);
 }
 
+static int alloc_kick_pseqs(void)
+{
+	int cpu;
+
+	/*
+	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc() as the
+	 * size can exceed percpu allocator limits on large machines.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs __rcu **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *new_pseqs;
+
+		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+		new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+					  GFP_KERNEL, cpu_to_node(cpu));
+		if (!new_pseqs) {
+			free_kick_pseqs();
+			return -ENOMEM;
+		}
+
+		rcu_assign_pointer(*pseqs, new_pseqs);
+	}
+
+	return 0;
+}
+
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 {
 	struct scx_sched *sch;
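A failure partway through is handled by the free_kick_pseqs() call, which is safe because not-yet-populated CPUs still hold NULL. For reference, struct_size() does the flex-array sizing with overflow checking; schematically:

    /*
     * struct_size(new_pseqs, seqs, nr_cpu_ids)
     *     == sizeof(struct scx_kick_pseqs)
     *      + nr_cpu_ids * sizeof(new_pseqs->seqs[0])
     *
     * saturating to SIZE_MAX on overflow so that kvzalloc_node() fails
     * cleanly rather than under-allocating.
     */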
@@ -4495,10 +4582,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
+	ret = alloc_kick_pseqs();
+	if (ret)
+		goto err_unlock;
+
 	sch = scx_alloc_and_add_sched(ops);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
-		goto err_unlock;
+		goto err_free_pseqs;
 	}
 
 	/*
@@ -4701,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	return 0;
 
+err_free_pseqs:
+	free_kick_pseqs();
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
@@ -5082,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
 	struct scx_rq *this_scx = &this_rq->scx;
-	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
 	bool should_wait = false;
+	unsigned long *pseqs;
 	s32 cpu;
 
+	if (unlikely(!pseqs_pcpu)) {
+		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs\n");
+		return;
+	}
+
+	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
 		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
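No explicit rcu_read_lock() is needed above: irq_work callbacks run in hard-IRQ context, which has counted as an RCU (and RCU-bh) read-side section since the flavors were unified, so rcu_dereference_bh() is safe here. The lifetime argument against the disable path is roughly:

    /*
     * kicker (IRQ context, this CPU)      disable path (any CPU)
     * ------------------------------      --------------------------------
     * p = __this_cpu_read(...);
     *                                     rcu_replace_pointer(*pseqs, NULL, ...);
     * seqs = rcu_dereference_bh(p)->seqs;
     * ... use seqs ...                    call_rcu() -> kvfree() runs only
     * <irq_work returns>                  after such readers have finished
     */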
@@ -5208,11 +5309,6 @@ void __init init_sched_ext_class(void)
 
 	scx_idle_init_masks();
 
-	scx_kick_cpus_pnt_seqs =
-		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
-			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
-	BUG_ON(!scx_kick_cpus_pnt_seqs);
-
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 		int n = cpu_to_node(cpu);
@@ -5688,8 +5784,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
@@ -5820,8 +5916,8 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
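KF_RCU makes the BPF verifier require that these kfuncs be called on RCU-protected (trusted) pointers within an RCU read-side section, matching scx_bpf_dsq_move() and scx_bpf_dsq_move_vtime(), which already carried the flag. A hedged sketch of the typical BPF-side caller (MY_DSQ_ID and the op name are illustrative; BPF_FOR_EACH_ITER comes from tools/sched_ext/include/scx/common.bpf.h):

    void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
    {
        struct task_struct *p;

        /* The DSQ iterator keeps 'p' RCU-protected, which is exactly what
         * KF_RCU now demands of the _set_slice()/_set_vtime() calls too. */
        bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
            scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, 5 * 1000 * 1000);
            scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0);
            break;
        }
    }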