Skip to content

Commit 6ef9e44

Browse files
committed
powerpc/watchdog: read TB close to where it is used
jira LE-1907
Rebuild_History Non-Buildable kernel-rt-5.14.0-284.30.1.rt14.315.el9_2
commit-author Nicholas Piggin <npiggin@gmail.com>
commit 1f01bf9

When taking watchdog actions, printing messages, comparing and re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use and under wd_smp_lock or printing lock (if applicable).

This should keep timebase mostly monotonic with kernel log messages, and could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to something a long way in the past, and causing other CPUs to appear to be stuck.

These additional TB reads are all slowpath (lockup has been detected), so performance does not matter.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211110025056.2084347-5-npiggin@gmail.com
(cherry picked from commit 1f01bf9)
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
1 parent 8dd20e2 commit 6ef9e44

File tree

1 file changed

+14
-12
lines changed

1 file changed

+14
-12
lines changed

arch/powerpc/kernel/watchdog.c

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
161161
/* Do not panic from here because that can recurse into NMI IPI layer */
162162
}
163163

164-
static bool set_cpu_stuck(int cpu, u64 tb)
164+
static bool set_cpu_stuck(int cpu)
165165
{
166166
cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
167167
cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
@@ -170,7 +170,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
170170
*/
171171
smp_mb();
172172
if (cpumask_empty(&wd_smp_cpus_pending)) {
173-
wd_smp_last_reset_tb = tb;
173+
wd_smp_last_reset_tb = get_tb();
174174
cpumask_andnot(&wd_smp_cpus_pending,
175175
&wd_cpus_enabled,
176176
&wd_smp_cpus_stuck);
@@ -179,14 +179,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
179179
return false;
180180
}
181181

182-
static void watchdog_smp_panic(int cpu, u64 tb)
182+
static void watchdog_smp_panic(int cpu)
183183
{
184184
static cpumask_t wd_smp_cpus_ipi; // protected by reporting
185185
unsigned long flags;
186+
u64 tb;
186187
int c;
187188

188189
wd_smp_lock(&flags);
189190
/* Double check some things under lock */
191+
tb = get_tb();
190192
if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
191193
goto out;
192194
if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
@@ -200,7 +202,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
200202
continue; // should not happen
201203

202204
__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
203-
if (set_cpu_stuck(c, tb))
205+
if (set_cpu_stuck(c))
204206
break;
205207
}
206208
if (cpumask_empty(&wd_smp_cpus_ipi)) {
@@ -246,15 +248,15 @@ static void watchdog_smp_panic(int cpu, u64 tb)
246248
wd_smp_unlock(&flags);
247249
}
248250

249-
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
251+
static void wd_smp_clear_cpu_pending(int cpu)
250252
{
251253
if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
252254
if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
253255
struct pt_regs *regs = get_irq_regs();
254256
unsigned long flags;
255257

256258
pr_emerg("CPU %d became unstuck TB:%lld\n",
257-
cpu, tb);
259+
cpu, get_tb());
258260
print_irqtrace_events(current);
259261
if (regs)
260262
show_regs(regs);
@@ -320,7 +322,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
320322
*/
321323
wd_smp_lock(&flags);
322324
if (cpumask_empty(&wd_smp_cpus_pending)) {
323-
wd_smp_last_reset_tb = tb;
325+
wd_smp_last_reset_tb = get_tb();
324326
cpumask_andnot(&wd_smp_cpus_pending,
325327
&wd_cpus_enabled,
326328
&wd_smp_cpus_stuck);
@@ -335,10 +337,10 @@ static void watchdog_timer_interrupt(int cpu)
335337

336338
per_cpu(wd_timer_tb, cpu) = tb;
337339

338-
wd_smp_clear_cpu_pending(cpu, tb);
340+
wd_smp_clear_cpu_pending(cpu);
339341

340342
if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
341-
watchdog_smp_panic(cpu, tb);
343+
watchdog_smp_panic(cpu);
342344
}
343345

344346
DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
@@ -375,7 +377,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
375377
return 0;
376378
}
377379

378-
set_cpu_stuck(cpu, tb);
380+
set_cpu_stuck(cpu);
379381

380382
wd_smp_unlock(&flags);
381383

@@ -436,7 +438,7 @@ void arch_touch_nmi_watchdog(void)
436438
tb = get_tb();
437439
if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
438440
per_cpu(wd_timer_tb, cpu) = tb;
439-
wd_smp_clear_cpu_pending(cpu, tb);
441+
wd_smp_clear_cpu_pending(cpu);
440442
}
441443
}
442444
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
@@ -494,7 +496,7 @@ static void stop_watchdog(void *arg)
494496
cpumask_clear_cpu(cpu, &wd_cpus_enabled);
495497
wd_smp_unlock(&flags);
496498

497-
wd_smp_clear_cpu_pending(cpu, get_tb());
499+
wd_smp_clear_cpu_pending(cpu);
498500
}
499501

500502
static int stop_watchdog_on_cpu(unsigned int cpu)

0 commit comments

Comments
 (0)