
Commit d414c1e

sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
JIRA: https://issues.redhat.com/browse/RHEL-48226

Conflicts: Minor context differences in kernel/sched/core.c due to not having
scheduler_tick() renamed to sched_tick(), and not having d4dbc99
("sched/cpufreq: Rename arch_update_thermal_pressure() =>
arch_update_hw_pressure()").

commit ddae0ca
Author: John Stultz <jstultz@google.com>
Date:   Tue Jun 18 14:58:55 2024 -0700

    sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath

    It was reported that in moving to 6.1, a larger than 10% regression
    was seen in the performance of
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).

    Using a simple reproducer, I found:

    5.10:
    100000000 calls in 24345994193 ns => 243.460 ns per call
    100000000 calls in 24288172050 ns => 242.882 ns per call
    100000000 calls in 24289135225 ns => 242.891 ns per call

    6.1:
    100000000 calls in 28248646742 ns => 282.486 ns per call
    100000000 calls in 28227055067 ns => 282.271 ns per call
    100000000 calls in 28177471287 ns => 281.775 ns per call

    The cause of this was finally narrowed down to the addition of
    psi_account_irqtime() in update_rq_clock_task(), in commit 52b1364
    ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure").

    In my initial attempt to resolve this, I leaned towards moving all
    accounting work out of the clock_gettime() call path, but it wasn't
    very pretty, so it will have to wait for a later, deeper rework.
    Instead, Peter shared this approach: rework psi_account_irqtime()
    to use its own psi_irq_time base for accounting, and move it out of
    the hotpath, calling it instead from sched_tick() and __schedule().

    In testing this, we found the importance of ensuring
    psi_account_irqtime() is run under the rq_lock, which Johannes
    Weiner helpfully explained, so also add some lockdep annotations to
    make that requirement clear.

    With this change the performance is back in line with 5.10:

    6.1+fix:
    100000000 calls in 24297324597 ns => 242.973 ns per call
    100000000 calls in 24318869234 ns => 243.189 ns per call
    100000000 calls in 24291564588 ns => 242.916 ns per call

    Reported-by: Jimmy Shiu <jimmyshiu@google.com>
    Originally-by: Peter Zijlstra <peterz@infradead.org>
    Signed-off-by: John Stultz <jstultz@google.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
    Reviewed-by: Qais Yousef <qyousef@layalina.io>
    Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com

Signed-off-by: Phil Auld <pauld@redhat.com>
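For context, a minimal sketch of a reproducer in the spirit of the test quoted above. The original tool is not part of the commit, so the structure, the CALLS constant, and the now_ns() helper are assumptions; it simply times back-to-back clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) calls and prints the average cost per call in the same format as the measurements:

/*
 * Hypothetical reproducer sketch: measure the per-call cost of
 * clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...), the path this commit
 * takes psi_account_irqtime() out of.
 */
#include <stdio.h>
#include <time.h>

#define CALLS 100000000ULL

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	struct timespec ts;
	unsigned long long start, end, i;

	start = now_ns();
	for (i = 0; i < CALLS; i++)
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	end = now_ns();

	printf("%llu calls in %llu ns => %.3f ns per call\n",
	       CALLS, end - start, (double)(end - start) / CALLS);
	return 0;
}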
Parent commit: f2ab2fc

File tree

4 files changed, +30 / -10 lines
kernel/sched/core.c

Lines changed: 5 additions & 2 deletions
@@ -722,7 +722,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
-	psi_account_irqtime(rq->curr, irq_delta);
 	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5672,7 +5671,7 @@ void scheduler_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *curr = rq->curr;
+	struct task_struct *curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
 	u64 resched_latency;
@@ -5684,6 +5683,9 @@ void scheduler_tick(void)
 
 	rq_lock(rq, &rf);
 
+	curr = rq->curr;
+	psi_account_irqtime(rq, curr, NULL);
+
 	update_rq_clock(rq);
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
@@ -6734,6 +6736,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	++*switch_count;
 
 	migrate_disable_switch(rq, prev);
+	psi_account_irqtime(rq, prev, next);
 	psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 	trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);

kernel/sched/psi.c

Lines changed: 16 additions & 5 deletions
@@ -772,6 +772,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	enum psi_states s;
 	u32 state_mask;
 
+	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
 	/*
@@ -990,22 +991,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
 {
-	int cpu = task_cpu(task);
+	int cpu = task_cpu(curr);
 	struct psi_group *group;
 	struct psi_group_cpu *groupc;
-	u64 now;
+	u64 now, irq;
+	s64 delta;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (!task->pid)
+	if (!curr->pid)
+		return;
+
+	lockdep_assert_rq_held(rq);
+	group = task_psi_group(curr);
+	if (prev && task_psi_group(prev) == group)
 		return;
 
 	now = cpu_clock(cpu);
+	irq = irq_time_read(cpu);
+	delta = (s64)(irq - rq->psi_irq_time);
+	if (delta < 0)
+		return;
+	rq->psi_irq_time = irq;
 
-	group = task_psi_group(task);
 	do {
 		if (!group->enabled)
 			continue;
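The hunk above is the heart of the rework: instead of being handed a precomputed irq_delta on every clock update, psi_account_irqtime() now keeps its own per-runqueue base (rq->psi_irq_time) and folds in only the IRQ time accrued since it last ran. A standalone sketch of that cumulative-counter pattern, with illustrative names rather than kernel APIs:

#include <stdint.h>

/* Illustrative stand-in for the per-runqueue state; the kernel keeps
 * this as rq->psi_irq_time. */
struct irq_accounting {
	uint64_t base;	/* counter value already accounted */
};

/* counter is a monotonically increasing total, cf. irq_time_read(). */
static uint64_t account_irq_delta(struct irq_accounting *acct, uint64_t counter)
{
	/* Signed cast mirrors the kernel code: if the counter briefly
	 * reads lower than the stored base, the negative delta is
	 * skipped rather than wrapping to a huge unsigned value. */
	int64_t delta = (int64_t)(counter - acct->base);

	if (delta < 0)
		return 0;
	acct->base = counter;
	return (uint64_t)delta;	/* amount to feed into the pressure totals */
}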

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
@@ -1130,6 +1130,7 @@ struct rq {
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64			prev_irq_time;
+	u64			psi_irq_time;
 #endif
 #ifdef CONFIG_PARAVIRT
 	u64			prev_steal_time;

kernel/sched/stats.h

Lines changed: 8 additions & 3 deletions
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
