Skip to content

Commit 100e22b

Browse files
committed
smp,csd: Throw an error if a CSD lock is stuck for too long
JIRA: https://issues.redhat.com/browse/RHEL-16867

commit 94b3f0b
Author: Rik van Riel <riel@surriel.com>
Date: Mon, 21 Aug 2023 16:04:09 -0400

smp,csd: Throw an error if a CSD lock is stuck for too long

The CSD lock seems to get stuck in 2 "modes". When it gets stuck temporarily, it usually gets released in a few seconds, and sometimes up to one or two minutes.

If the CSD lock stays stuck for more than several minutes, it never seems to get unstuck, and gradually more and more things in the system end up also getting stuck.

In the latter case, we should just give up, so the system can dump out a little more information about what went wrong, and, with panic_on_oops and a kdump kernel loaded, dump a whole bunch more information about what might have gone wrong.

In addition, there is an smp.panic_on_ipistall kernel boot parameter that by default retains the old behavior, but when set enables the panic after the CSD lock has been stuck for more than the specified number of milliseconds, as in 300,000 for five minutes.

[ paulmck: Apply Imran Khan feedback. ]
[ paulmck: Apply Leonardo Bras feedback. ]

Link: https://lore.kernel.org/lkml/bc7cc8b0-f587-4451-8bcd-0daae627bcc7@paulmck-laptop/
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Imran Khan <imran.f.khan@oracle.com>
Reviewed-by: Leonardo Bras <leobras@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
1 parent 04fa877 commit 100e22b

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5979,6 +5979,13 @@
59795979
This feature may be more efficiently disabled
59805980
using the csdlock_debug= kernel parameter.
59815981

5982+
smp.panic_on_ipistall= [KNL]
5983+
If a CSD-lock wait extends for more than
5984+
the specified number of milliseconds, panic the
5985+
system. By default, let CSD-lock acquisitions
5986+
take as long as they take. Specifying 300,000
5987+
for this value provides a 5-minute timeout.
5988+
59825989
smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
59835990
smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
59845991
smsc-ircc2.ircc_sir= [HW] SIR base I/O port

kernel/smp.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
165165

166166
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
167167
module_param(csd_lock_timeout, ulong, 0444);
168+
static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */
169+
module_param(panic_on_ipistall, int, 0444);
168170

169171
static atomic_t csd_bug_count = ATOMIC_INIT(0);
170172

@@ -225,6 +227,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
225227
}
226228

227229
ts2 = sched_clock();
230+
/* How long since we last checked for a stuck CSD lock. */
228231
ts_delta = ts2 - *ts1;
229232
if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
230233
return false;
@@ -238,9 +241,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
238241
else
239242
cpux = cpu;
240243
cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
244+
/* How long since this CSD lock was stuck. */
245+
ts_delta = ts2 - ts0;
241246
pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
242-
firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
247+
firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
243248
cpu, csd->func, csd->info);
249+
/*
250+
* If the CSD lock is still stuck after 5 minutes, it is unlikely
251+
* to become unstuck. Use a signed comparison to avoid triggering
252+
* on underflows when the TSC is out of sync between sockets.
253+
*/
254+
BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
244255
if (cpu_cur_csd && csd != cpu_cur_csd) {
245256
pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
246257
*bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),

0 commit comments

Comments
 (0)