Skip to content

Commit 8dd20e2

Browse files
committed
powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi
jira LE-1907 Rebuild_History Non-Buildable kernel-rt-5.14.0-284.30.1.rt14.315.el9_2 commit-author Nicholas Piggin <npiggin@gmail.com> commit 76521c4 There is a deadlock with the console_owner lock and the wd_smp_lock: CPU x takes the console_owner lock CPU y takes a watchdog timer interrupt and takes __wd_smp_lock CPU x takes a soft-NMI interrupt, detects deadlock, spins on __wd_smp_lock CPU y detects deadlock, tries to print something and spins on console_owner -> deadlock Change the watchdog locking scheme so wd_smp_lock protects the watchdog internal data, but "reporting" (printing, issuing NMI IPIs, taking any action outside of watchdog) uses a non-waiting exclusion. If a CPU detects a problem but can not take the reporting lock, it just returns because something else is already reporting. It will try again at some point. Typically hard lockup watchdog report usefulness is not impacted due to failure to spewing a large enough amount of data in as short a time as possible, but by messages getting garbled. Laurent debugged this and found the deadlock, and this patch is based on his general approach to avoid expensive operations while holding the lock. With the addition of the reporting exclusion. Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> [np: rework to add reporting exclusion update changelog] Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20211110025056.2084347-4-npiggin@gmail.com (cherry picked from commit 76521c4) Signed-off-by: Jonathan Maple <jmaple@ciq.com>
1 parent 2737ae4 commit 8dd20e2

File tree

1 file changed

+74
-19
lines changed

1 file changed

+74
-19
lines changed

arch/powerpc/kernel/watchdog.c

Lines changed: 74 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,36 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
8989

9090
/* SMP checker bits */
9191
static unsigned long __wd_smp_lock;
92+
static unsigned long __wd_reporting;
9293
static cpumask_t wd_smp_cpus_pending;
9394
static cpumask_t wd_smp_cpus_stuck;
9495
static u64 wd_smp_last_reset_tb;
9596

97+
/*
98+
* Try to take the exclusive watchdog action / NMI IPI / printing lock.
99+
* wd_smp_lock must be held. If this fails, we should return and wait
100+
* for the watchdog to kick in again (or another CPU to trigger it).
101+
*
102+
* Importantly, if hardlockup_panic is set, wd_try_report failure should
103+
* not delay the panic, because whichever other CPU is reporting will
104+
* call panic.
105+
*/
106+
static bool wd_try_report(void)
107+
{
108+
if (__wd_reporting)
109+
return false;
110+
__wd_reporting = 1;
111+
return true;
112+
}
113+
114+
/* End printing after successful wd_try_report. wd_smp_lock not required. */
115+
static void wd_end_reporting(void)
116+
{
117+
smp_mb(); /* End printing "critical section" */
118+
WARN_ON_ONCE(__wd_reporting == 0);
119+
WRITE_ONCE(__wd_reporting, 0);
120+
}
121+
96122
static inline void wd_smp_lock(unsigned long *flags)
97123
{
98124
/*
@@ -155,6 +181,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
155181

156182
static void watchdog_smp_panic(int cpu, u64 tb)
157183
{
184+
static cpumask_t wd_smp_cpus_ipi; // protected by reporting
158185
unsigned long flags;
159186
int c;
160187

@@ -164,11 +191,26 @@ static void watchdog_smp_panic(int cpu, u64 tb)
164191
goto out;
165192
if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
166193
goto out;
167-
if (cpumask_weight(&wd_smp_cpus_pending) == 0)
194+
if (!wd_try_report())
168195
goto out;
196+
for_each_online_cpu(c) {
197+
if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
198+
continue;
199+
if (c == cpu)
200+
continue; // should not happen
201+
202+
__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
203+
if (set_cpu_stuck(c, tb))
204+
break;
205+
}
206+
if (cpumask_empty(&wd_smp_cpus_ipi)) {
207+
wd_end_reporting();
208+
goto out;
209+
}
210+
wd_smp_unlock(&flags);
169211

170212
pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
171-
cpu, cpumask_pr_args(&wd_smp_cpus_pending));
213+
cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
172214
pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
173215
cpu, tb, wd_smp_last_reset_tb,
174216
tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);
@@ -178,22 +220,14 @@ static void watchdog_smp_panic(int cpu, u64 tb)
178220
* Try to trigger the stuck CPUs, unless we are going to
179221
* get a backtrace on all of them anyway.
180222
*/
181-
for_each_cpu(c, &wd_smp_cpus_pending) {
182-
bool empty;
183-
if (c == cpu)
184-
continue;
185-
/* Take the stuck CPUs out of the watch group */
186-
empty = set_cpu_stuck(c, tb);
223+
for_each_cpu(c, &wd_smp_cpus_ipi) {
187224
smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
188-
if (empty)
189-
break;
225+
__cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
190226
}
191-
}
192-
193-
wd_smp_unlock(&flags);
194-
195-
if (sysctl_hardlockup_all_cpu_backtrace)
227+
} else {
196228
trigger_allbutself_cpu_backtrace();
229+
cpumask_clear(&wd_smp_cpus_ipi);
230+
}
197231

198232
/*
199233
* Force flush any remote buffers that might be stuck in IRQ context
@@ -204,6 +238,8 @@ static void watchdog_smp_panic(int cpu, u64 tb)
204238
if (hardlockup_panic)
205239
nmi_panic(NULL, "Hard LOCKUP");
206240

241+
wd_end_reporting();
242+
207243
return;
208244

209245
out:
@@ -217,8 +253,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
217253
struct pt_regs *regs = get_irq_regs();
218254
unsigned long flags;
219255

220-
wd_smp_lock(&flags);
221-
222256
pr_emerg("CPU %d became unstuck TB:%lld\n",
223257
cpu, tb);
224258
print_irqtrace_events(current);
@@ -227,6 +261,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
227261
else
228262
dump_stack();
229263

264+
wd_smp_lock(&flags);
230265
cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
231266
wd_smp_unlock(&flags);
232267
} else {
@@ -322,13 +357,28 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
322357

323358
tb = get_tb();
324359
if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
360+
/*
361+
* Taking wd_smp_lock here means it is a soft-NMI lock, which
362+
* means we can't take any regular or irqsafe spin locks while
363+
* holding this lock. This is why timers can't printk while
364+
* holding the lock.
365+
*/
325366
wd_smp_lock(&flags);
326367
if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
327368
wd_smp_unlock(&flags);
328369
return 0;
329370
}
371+
if (!wd_try_report()) {
372+
wd_smp_unlock(&flags);
373+
/* Couldn't report, try again in 100ms */
374+
mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
375+
return 0;
376+
}
377+
330378
set_cpu_stuck(cpu, tb);
331379

380+
wd_smp_unlock(&flags);
381+
332382
pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
333383
cpu, (void *)regs->nip);
334384
pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
@@ -338,14 +388,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
338388
print_irqtrace_events(current);
339389
show_regs(regs);
340390

341-
wd_smp_unlock(&flags);
342-
343391
if (sysctl_hardlockup_all_cpu_backtrace)
344392
trigger_allbutself_cpu_backtrace();
345393

346394
if (hardlockup_panic)
347395
nmi_panic(regs, "Hard LOCKUP");
396+
397+
wd_end_reporting();
348398
}
399+
/*
400+
* We are okay to change DEC in soft_nmi_interrupt because the masked
401+
* handler has marked a DEC as pending, so the timer interrupt will be
402+
* replayed as soon as local irqs are enabled again.
403+
*/
349404
if (wd_panic_timeout_tb < 0x7fffffff)
350405
mtspr(SPRN_DEC, wd_panic_timeout_tb);
351406

0 commit comments

Comments
 (0)