
Commit d414c1e

sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
JIRA: https://issues.redhat.com/browse/RHEL-48226

Conflicts: Minor context differences in kernel/sched/core.c due to not having
scheduler_tick() renamed to sched_tick(), and not having d4dbc99
("sched/cpufreq: Rename arch_update_thermal_pressure() =>
arch_update_hw_pressure()").

commit ddae0ca
Author: John Stultz <jstultz@google.com>
Date:   Tue Jun 18 14:58:55 2024 -0700

    sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath

    It was reported that in moving to 6.1, a larger than 10% regression
    was seen in the performance of
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).

    Using a simple reproducer, I found:

    5.10:
    100000000 calls in 24345994193 ns => 243.460 ns per call
    100000000 calls in 24288172050 ns => 242.882 ns per call
    100000000 calls in 24289135225 ns => 242.891 ns per call

    6.1:
    100000000 calls in 28248646742 ns => 282.486 ns per call
    100000000 calls in 28227055067 ns => 282.271 ns per call
    100000000 calls in 28177471287 ns => 281.775 ns per call

    The cause of this was finally narrowed down to the addition of
    psi_account_irqtime() in update_rq_clock_task(), in commit 52b1364
    ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure").

    In my initial attempt to resolve this, I leaned towards moving all
    accounting work out of the clock_gettime() call path, but it wasn't
    very pretty, so it will have to wait for a later, deeper rework.
    Instead, Peter shared this approach: rework psi_account_irqtime()
    to use its own psi_irq_time base for accounting, and move it out of
    the hotpath, calling it instead from sched_tick() and __schedule().

    In testing this, we found the importance of ensuring
    psi_account_irqtime() is run under the rq_lock, which Johannes
    Weiner helpfully explained, so also add some lockdep annotations to
    make that requirement clear.

    With this change the performance is back in line with 5.10:

    6.1+fix:
    100000000 calls in 24297324597 ns => 242.973 ns per call
    100000000 calls in 24318869234 ns => 243.189 ns per call
    100000000 calls in 24291564588 ns => 242.916 ns per call

    Reported-by: Jimmy Shiu <jimmyshiu@google.com>
    Originally-by: Peter Zijlstra <peterz@infradead.org>
    Signed-off-by: John Stultz <jstultz@google.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
    Reviewed-by: Qais Yousef <qyousef@layalina.io>
    Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com

Signed-off-by: Phil Auld <pauld@redhat.com>
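For context, a minimal sketch of a reproducer in the spirit of the test quoted above. The original tool is not part of the commit, so the structure, the CALLS constant, and the now_ns() helper are assumptions; it simply times back-to-back clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) calls and prints the average cost per call in the same format as the measurements:

/*
 * Hypothetical reproducer sketch: measure the per-call cost of
 * clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...), the path this commit
 * takes psi_account_irqtime() out of.
 */
#include <stdio.h>
#include <time.h>

#define CALLS 100000000ULL

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	struct timespec ts;
	unsigned long long start, end, i;

	start = now_ns();
	for (i = 0; i < CALLS; i++)
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	end = now_ns();

	printf("%llu calls in %llu ns => %.3f ns per call\n",
	       CALLS, end - start, (double)(end - start) / CALLS);
	return 0;
}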
Parent commit: f2ab2fc

File tree

4 files changed, +30 / -10 lines
kernel/sched/core.c

Lines changed: 5 additions & 2 deletions
@@ -722,7 +722,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
-	psi_account_irqtime(rq->curr, irq_delta);
 	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5672,7 +5671,7 @@ void scheduler_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *curr = rq->curr;
+	struct task_struct *curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
 	u64 resched_latency;
@@ -5684,6 +5683,9 @@ void scheduler_tick(void)
 
 	rq_lock(rq, &rf);
 
+	curr = rq->curr;
+	psi_account_irqtime(rq, curr, NULL);
+
 	update_rq_clock(rq);
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
@@ -6734,6 +6736,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	++*switch_count;
 
 	migrate_disable_switch(rq, prev);
+	psi_account_irqtime(rq, prev, next);
 	psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 	trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);

kernel/sched/psi.c

Lines changed: 16 additions & 5 deletions
@@ -772,6 +772,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	enum psi_states s;
 	u32 state_mask;
 
+	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
 	/*
@@ -990,22 +991,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
 {
-	int cpu = task_cpu(task);
+	int cpu = task_cpu(curr);
 	struct psi_group *group;
 	struct psi_group_cpu *groupc;
-	u64 now;
+	u64 now, irq;
+	s64 delta;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (!task->pid)
+	if (!curr->pid)
+		return;
+
+	lockdep_assert_rq_held(rq);
+	group = task_psi_group(curr);
+	if (prev && task_psi_group(prev) == group)
 		return;
 
 	now = cpu_clock(cpu);
+	irq = irq_time_read(cpu);
+	delta = (s64)(irq - rq->psi_irq_time);
+	if (delta < 0)
+		return;
+	rq->psi_irq_time = irq;
 
-	group = task_psi_group(task);
 	do {
 		if (!group->enabled)
 			continue;
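The hunk above is the heart of the rework: instead of being handed a precomputed irq_delta on every clock update, psi_account_irqtime() now keeps its own per-runqueue base (rq->psi_irq_time) and folds in only the IRQ time accrued since it last ran. A standalone sketch of that cumulative-counter pattern, with illustrative names rather than kernel APIs:

#include <stdint.h>

/* Illustrative stand-in for the per-runqueue state; the kernel keeps
 * this as rq->psi_irq_time. */
struct irq_accounting {
	uint64_t base;	/* counter value already accounted */
};

/* counter is a monotonically increasing total, cf. irq_time_read(). */
static uint64_t account_irq_delta(struct irq_accounting *acct, uint64_t counter)
{
	/* Signed cast mirrors the kernel code: if the counter briefly
	 * reads lower than the stored base, the negative delta is
	 * skipped rather than wrapping to a huge unsigned value. */
	int64_t delta = (int64_t)(counter - acct->base);

	if (delta < 0)
		return 0;
	acct->base = counter;
	return (uint64_t)delta;	/* amount to feed into the pressure totals */
}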

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
@@ -1130,6 +1130,7 @@ struct rq {
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64			prev_irq_time;
+	u64			psi_irq_time;
 #endif
 #ifdef CONFIG_PARAVIRT
 	u64			prev_steal_time;

kernel/sched/stats.h

Lines changed: 8 additions & 3 deletions
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
