
Commit 2e97e2d

sched/psi: Optimize psi_group_change() cpu_clock() usage
JIRA: https://issues.redhat.com/browse/RHEL-110301

commit 570c8ef
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Fri May 23 17:28:00 2025 +0200

    sched/psi: Optimize psi_group_change() cpu_clock() usage

    Dietmar reported that commit 3840cbe ("sched: psi: fix bogus pressure
    spikes from aggregation race") caused a regression for him on a high
    context switch rate benchmark (schbench) due to the now repeating
    cpu_clock() calls.

    In particular the problem is that get_recent_times() will extrapolate
    the current state to 'now'. But if an update uses a timestamp from
    before the start of the update, it is possible to get two reads with
    inconsistent results. It is effectively back-dating an update.

    (Note that this all hard-relies on the clock being synchronized across
    CPUs -- if this is not the case, all bets are off.)

    Combine this problem with the fact that there are per-group-per-cpu
    seqcounts, and the commit in question pushed the clock read into the
    group iteration, causing tree-depth cpu_clock() calls. On architectures
    where cpu_clock() has appreciable overhead, this hurts.

    Instead move to a per-cpu seqcount, which allows us to have a single
    clock read for all group updates, increasing internal consistency and
    lowering update overhead. This comes at the cost of a longer update
    side (proportional to the tree depth) which can cause the read side to
    retry more often.

    Fixes: 3840cbe ("sched: psi: fix bogus pressure spikes from aggregation race")
    Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Acked-by: Johannes Weiner <hannes@cmpxchg.org>
    Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
    Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net

Signed-off-by: Phil Auld <pauld@redhat.com>
1 parent c8e2808 commit 2e97e2d
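The core idea is the classic seqcount pattern with the clock read hoisted out of the per-group loop. Below is a minimal userspace sketch (not kernel code) of that shape: one sequence counter guards all of a CPU's per-group state, the writer samples the clock once and then walks every level of the hierarchy, and a reader retries whenever its snapshot overlaps a write. All names in the sketch (group_state, update_hierarchy, read_retry, and so on) are invented for illustration; only the begin/walk/end and read/retry structure mirrors the psi_write_begin()/psi_write_end() and psi_read_begin()/psi_read_retry() helpers added in the diff below.

    /*
     * Illustrative userspace sketch of a seqcount-protected hierarchy
     * update with a single clock read; not the kernel implementation.
     */
    #define _POSIX_C_SOURCE 200809L
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static atomic_uint psi_seq;                 /* stand-in for the per-CPU seqcount */

    struct group_state {                        /* stand-in for struct psi_group_cpu */
            uint64_t time_accrued;
            uint64_t state_start;
    };

    static uint64_t now_ns(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
    }

    /* Write side: an odd sequence value means an update is in progress. */
    static void write_begin(void)
    {
            atomic_fetch_add_explicit(&psi_seq, 1, memory_order_release);
    }

    static void write_end(void)
    {
            atomic_fetch_add_explicit(&psi_seq, 1, memory_order_release);
    }

    /* Read side: a snapshot is valid only if the sequence was even and unchanged. */
    static unsigned int read_begin(void)
    {
            return atomic_load_explicit(&psi_seq, memory_order_acquire);
    }

    static int read_retry(unsigned int seq)
    {
            return (seq & 1) ||
                   atomic_load_explicit(&psi_seq, memory_order_acquire) != seq;
    }

    /* One clock read covers every level of the hierarchy, as in the patch. */
    static void update_hierarchy(struct group_state *levels, int depth)
    {
            uint64_t now = now_ns();
            int i;

            write_begin();
            for (i = 0; i < depth; i++) {
                    levels[i].time_accrued += now - levels[i].state_start;
                    levels[i].state_start = now;
            }
            write_end();
    }

    int main(void)
    {
            struct group_state levels[3] = {
                    { 0, now_ns() }, { 0, now_ns() }, { 0, now_ns() },
            };
            unsigned int seq;
            uint64_t snapshot;

            update_hierarchy(levels, 3);

            do {                                /* reader: retry until coherent */
                    seq = read_begin();
                    snapshot = levels[0].time_accrued;
            } while (read_retry(seq));

            printf("root group accrued %llu ns\n", (unsigned long long)snapshot);
            return 0;
    }

As the commit message notes, the trade-off is that the write-side critical section now spans the whole hierarchy walk, so a concurrent reader may have to retry more often than with per-group seqcounts.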

2 files changed (+68, -59 lines)


include/linux/psi_types.h

Lines changed: 2 additions & 4 deletions
@@ -84,11 +84,9 @@ enum psi_aggregators {
 struct psi_group_cpu {
        /* 1st cacheline updated by the scheduler */

-       /* Aggregator needs to know of concurrent changes */
-       seqcount_t seq ____cacheline_aligned_in_smp;
-
        /* States of the tasks belonging to this group */
-       unsigned int tasks[NR_PSI_TASK_COUNTS];
+       unsigned int tasks[NR_PSI_TASK_COUNTS]
+                       ____cacheline_aligned_in_smp;

        /* Aggregate pressure state derived from the tasks */
        u32 state_mask;
kernel/sched/psi.c

Lines changed: 66 additions & 55 deletions
@@ -176,6 +176,28 @@ struct psi_group psi_system = {
        .pcpu = &system_group_pcpu,
 };

+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+       write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+       write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+       return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+       return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
 static void psi_avgs_work(struct work_struct *work);

 static void poll_timer_fn(struct timer_list *t);
@@ -186,7 +208,7 @@ static void group_init(struct psi_group *group)

        group->enabled = true;
        for_each_possible_cpu(cpu)
-               seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+               seqcount_init(per_cpu_ptr(&psi_seq, cpu));
        group->avg_last_update = sched_clock();
        group->avg_next_update = group->avg_last_update + psi_period;
        mutex_init(&group->avgs_lock);
@@ -266,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu,

        /* Snapshot a coherent view of the CPU state */
        do {
-               seq = read_seqcount_begin(&groupc->seq);
+               seq = psi_read_begin(cpu);
                now = cpu_clock(cpu);
                memcpy(times, groupc->times, sizeof(groupc->times));
                state_mask = groupc->state_mask;
                state_start = groupc->state_start;
                if (cpu == current_cpu)
                        memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
-       } while (read_seqcount_retry(&groupc->seq, seq));
+       } while (psi_read_retry(cpu, seq));

        /* Calculate state time deltas against the previous snapshot */
        for (s = 0; s < NR_PSI_STATES; s++) {
@@ -772,30 +794,20 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
                groupc->times[PSI_NONIDLE] += delta;
 }

+#define for_each_group(iter, group) \
+       for (typeof(group) iter = group; iter; iter = iter->parent)
+
 static void psi_group_change(struct psi_group *group, int cpu,
                             unsigned int clear, unsigned int set,
-                            bool wake_clock)
+                            u64 now, bool wake_clock)
 {
        struct psi_group_cpu *groupc;
        unsigned int t, m;
        u32 state_mask;
-       u64 now;

        lockdep_assert_rq_held(cpu_rq(cpu));
        groupc = per_cpu_ptr(group->pcpu, cpu);

-       /*
-        * First we update the task counts according to the state
-        * change requested through the @clear and @set bits.
-        *
-        * Then if the cgroup PSI stats accounting enabled, we
-        * assess the aggregate resource states this CPU's tasks
-        * have been in since the last change, and account any
-        * SOME and FULL time these may have resulted in.
-        */
-       write_seqcount_begin(&groupc->seq);
-       now = cpu_clock(cpu);
-
        /*
         * Start with TSK_ONCPU, which doesn't have a corresponding
         * task count - it's just a boolean flag directly encoded in
@@ -847,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu,

                groupc->state_mask = state_mask;

-               write_seqcount_end(&groupc->seq);
                return;
        }

@@ -868,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu,

        groupc->state_mask = state_mask;

-       write_seqcount_end(&groupc->seq);
-
        if (state_mask & group->rtpoll_states)
                psi_schedule_rtpoll_work(group, 1, false);

@@ -904,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
 void psi_task_change(struct task_struct *task, int clear, int set)
 {
        int cpu = task_cpu(task);
-       struct psi_group *group;
+       u64 now;

        if (!task->pid)
                return;

        psi_flags_change(task, clear, set);

-       group = task_psi_group(task);
-       do {
-               psi_group_change(group, cpu, clear, set, true);
-       } while ((group = group->parent));
+       psi_write_begin(cpu);
+       now = cpu_clock(cpu);
+       for_each_group(group, task_psi_group(task))
+               psi_group_change(group, cpu, clear, set, now, true);
+       psi_write_end(cpu);
 }

 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                     bool sleep)
 {
-       struct psi_group *group, *common = NULL;
+       struct psi_group *common = NULL;
        int cpu = task_cpu(prev);
+       u64 now;
+
+       psi_write_begin(cpu);
+       now = cpu_clock(cpu);

        if (next->pid) {
                psi_flags_change(next, 0, TSK_ONCPU);
@@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                 * ancestors with @prev, those will already have @prev's
                 * TSK_ONCPU bit set, and we can stop the iteration there.
                 */
-               group = task_psi_group(next);
-               do {
-                       if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
-                           PSI_ONCPU) {
+               for_each_group(group, task_psi_group(next)) {
+                       struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+                       if (groupc->state_mask & PSI_ONCPU) {
                                common = group;
                                break;
                        }
-
-                       psi_group_change(group, cpu, 0, TSK_ONCPU, true);
-               } while ((group = group->parent));
+                       psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+               }
        }

        if (prev->pid) {
@@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,

                psi_flags_change(prev, clear, set);

-               group = task_psi_group(prev);
-               do {
+               for_each_group(group, task_psi_group(prev)) {
                        if (group == common)
                                break;
-                       psi_group_change(group, cpu, clear, set, wake_clock);
-               } while ((group = group->parent));
+                       psi_group_change(group, cpu, clear, set, now, wake_clock);
+               }

                /*
                 * TSK_ONCPU is handled up to the common ancestor. If there are
@@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                 */
                if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
                        clear &= ~TSK_ONCPU;
-                       for (; group; group = group->parent)
-                               psi_group_change(group, cpu, clear, set, wake_clock);
+                       for_each_group(group, common)
+                               psi_group_change(group, cpu, clear, set, now, wake_clock);
                }
        }
+       psi_write_end(cpu);
 }

 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
 {
        int cpu = task_cpu(curr);
-       struct psi_group *group;
        struct psi_group_cpu *groupc;
        s64 delta;
        u64 irq;
+       u64 now;

        if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
                return;
@@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
                return;

        lockdep_assert_rq_held(rq);
-       group = task_psi_group(curr);
-       if (prev && task_psi_group(prev) == group)
+       if (prev && task_psi_group(prev) == task_psi_group(curr))
                return;

        irq = irq_time_read(cpu);
@@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
                return;
        rq->psi_irq_time = irq;

-       do {
-               u64 now;
+       psi_write_begin(cpu);
+       now = cpu_clock(cpu);

+       for_each_group(group, task_psi_group(curr)) {
                if (!group->enabled)
                        continue;

                groupc = per_cpu_ptr(group->pcpu, cpu);

-               write_seqcount_begin(&groupc->seq);
-               now = cpu_clock(cpu);
-
                record_times(groupc, now);
                groupc->times[PSI_IRQ_FULL] += delta;

-               write_seqcount_end(&groupc->seq);
-
                if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
                        psi_schedule_rtpoll_work(group, 1, false);
-       } while ((group = group->parent));
+       }
+       psi_write_end(cpu);
 }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */

@@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group)
                return;

        for_each_possible_cpu(cpu) {
-               struct rq *rq = cpu_rq(cpu);
-               struct rq_flags rf;
+               u64 now;

-               rq_lock_irq(rq, &rf);
-               psi_group_change(group, cpu, 0, 0, true);
-               rq_unlock_irq(rq, &rf);
+               guard(rq_lock_irq)(cpu_rq(cpu));
+
+               psi_write_begin(cpu);
+               now = cpu_clock(cpu);
+               psi_group_change(group, cpu, 0, 0, now, true);
+               psi_write_end(cpu);
        }
 }
 #endif /* CONFIG_CGROUPS */
