Skip to content

Commit 2737ae4

Browse files
committed
powerpc/watchdog: tighten non-atomic read-modify-write access
jira LE-1907 Rebuild_History Non-Buildable kernel-rt-5.14.0-284.30.1.rt14.315.el9_2 commit-author Nicholas Piggin <npiggin@gmail.com> commit 858c93c Most updates to wd_smp_cpus_pending are under lock except the watchdog interrupt bit clear. This can race with non-atomic RMW updates to the mask under lock, which can happen in two instances: Firstly, if another CPU detects this one is stuck, removes it from the mask, mask becomes empty and is re-filled with non-atomic stores. This is okay because it would re-fill the mask with this CPU's bit clear anyway (because this CPU is now stuck), so it doesn't matter that the bit clear update got "lost". Add a comment for this. Secondly, if another CPU detects a different CPU is stuck and removes it from the pending mask with a non-atomic store to bytes which also include the bit of this CPU. This case can result in the bit clear being lost and the end result being the bit is set. This should be so rare it hardly matters, but to make things simpler to reason about just avoid the non-atomic access for that case. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20211110025056.2084347-3-npiggin@gmail.com (cherry picked from commit 858c93c) Signed-off-by: Jonathan Maple <jmaple@ciq.com>
1 parent f2cdf89 commit 2737ae4

File tree

1 file changed

+26
-10
lines changed

1 file changed

+26
-10
lines changed

arch/powerpc/kernel/watchdog.c

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
135135
/* Do not panic from here because that can recurse into NMI IPI layer */
136136
}
137137

138-
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
138+
static bool set_cpu_stuck(int cpu, u64 tb)
139139
{
140-
cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
141-
cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
140+
cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
141+
cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
142142
/*
143143
* See wd_smp_clear_cpu_pending()
144144
*/
@@ -148,11 +148,9 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
148148
cpumask_andnot(&wd_smp_cpus_pending,
149149
&wd_cpus_enabled,
150150
&wd_smp_cpus_stuck);
151+
return true;
151152
}
152-
}
153-
static void set_cpu_stuck(int cpu, u64 tb)
154-
{
155-
set_cpumask_stuck(cpumask_of(cpu), tb);
153+
return false;
156154
}
157155

158156
static void watchdog_smp_panic(int cpu, u64 tb)
@@ -181,15 +179,17 @@ static void watchdog_smp_panic(int cpu, u64 tb)
181179
* get a backtrace on all of them anyway.
182180
*/
183181
for_each_cpu(c, &wd_smp_cpus_pending) {
182+
bool empty;
184183
if (c == cpu)
185184
continue;
185+
/* Take the stuck CPUs out of the watch group */
186+
empty = set_cpu_stuck(c, tb);
186187
smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
188+
if (empty)
189+
break;
187190
}
188191
}
189192

190-
/* Take the stuck CPUs out of the watch group */
191-
set_cpumask_stuck(&wd_smp_cpus_pending, tb);
192-
193193
wd_smp_unlock(&flags);
194194

195195
if (sysctl_hardlockup_all_cpu_backtrace)
@@ -247,6 +247,22 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
247247
return;
248248
}
249249

250+
/*
251+
* All other updates to wd_smp_cpus_pending are performed under
252+
* wd_smp_lock. All of them are atomic except the case where the
253+
* mask becomes empty and is reset. This will not happen here because
254+
* cpu was tested to be in the bitmap (above), and a CPU only clears
255+
* its own bit. _Except_ in the case where another CPU has detected a
256+
* hard lockup on our CPU and takes us out of the pending mask. So in
257+
* normal operation there will be no race here, no problem.
258+
*
259+
* In the lockup case, this atomic clear-bit vs a store that refills
260+
* other bits in the accessed word wll not be a problem. The bit clear
261+
* is atomic so it will not cause the store to get lost, and the store
262+
* will never set this bit so it will not overwrite the bit clear. The
263+
* only way for a stuck CPU to return to the pending bitmap is to
264+
* become unstuck itself.
265+
*/
250266
cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
251267

252268
/*

0 commit comments

Comments
 (0)