Skip to content

Commit 8b6c391

Browse files
committed
clocksource: Scale the watchdog read retries automatically
JIRA: https://issues.redhat.com/browse/RHEL-76143 Conflicts: A context diff in the include/linux/clocksource.h hunk due to the presence of later upstream commit 6b2e299 ("timekeeping: Provide infrastructure for converting to/from a base clock"). commit 2ed08e4 Author: Feng Tang <feng.tang@intel.com> Date: Wed, 21 Feb 2024 14:08:59 +0800 clocksource: Scale the watchdog read retries automatically On a 8-socket server the TSC is wrongly marked as 'unstable' and disabled during boot time on about one out of 120 boot attempts: clocksource: timekeeping watchdog on CPU227: wd-tsc-wd excessive read-back delay of 153560ns vs. limit of 125000ns, wd-wd read-back delay only 11440ns, attempt 3, marking tsc unstable tsc: Marking TSC unstable due to clocksource watchdog TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. sched_clock: Marking unstable (119294969739, 159204297)<-(125446229205, -5992055152) clocksource: Checking clocksource tsc synchronization from CPU 319 to CPUs 0,99,136,180,210,542,601,896. clocksource: Switched to clocksource hpet The reason is that for platform with a large number of CPUs, there are sporadic big or huge read latencies while reading the watchog/clocksource during boot or when system is under stress work load, and the frequency and maximum value of the latency goes up with the number of online CPUs. The cCurrent code already has logic to detect and filter such high latency case by reading the watchdog twice and checking the two deltas. Due to the randomness of the latency, there is a low probabilty that the first delta (latency) is big, but the second delta is small and looks valid. The watchdog code retries the readouts by default twice, which is not necessarily sufficient for systems with a large number of CPUs. There is a command line parameter 'max_cswd_read_retries' which allows to increase the number of retries, but that's not user friendly as it needs to be tweaked per system. As the number of required retries is proportional to the number of online CPUs, this parameter can be calculated at runtime. Scale and enlarge the number of retries according to the number of online CPUs and remove the command line parameter completely. [ tglx: Massaged change log and comments ] Signed-off-by: Feng Tang <feng.tang@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Jin Wang <jin1.wang@intel.com> Tested-by: Paul E. McKenney <paulmck@kernel.org> Reviewed-by: Waiman Long <longman@redhat.com> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Link: https://lore.kernel.org/r/20240221060859.1027450-1-feng.tang@intel.com Signed-off-by: Waiman Long <longman@redhat.com>
1 parent 6dff788 commit 8b6c391

File tree

5 files changed

+25
-20
lines changed

5 files changed

+25
-20
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -669,12 +669,6 @@
669669
loops can be debugged more effectively on production
670670
systems.
671671

672-
clocksource.max_cswd_read_retries= [KNL]
673-
Number of clocksource_watchdog() retries due to
674-
external delays before the clock will be marked
675-
unstable. Defaults to two retries, that is,
676-
three attempts to read the clock under test.
677-
678672
clocksource.verify_n_cpus= [KNL]
679673
Limit the number of CPUs checked for clocksources
680674
marked with CLOCK_SOURCE_VERIFY_PERCPU that

include/linux/clocksource.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,19 @@ static inline void timer_probe(void) {}
297297
#define TIMER_ACPI_DECLARE(name, table_id, fn) \
298298
ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
299299

300-
extern ulong max_cswd_read_retries;
300+
static inline unsigned int clocksource_get_max_watchdog_retry(void)
301+
{
302+
/*
303+
* When system is in the boot phase or under heavy workload, there
304+
* can be random big latencies during the clocksource/watchdog
305+
* read, so allow retries to filter the noise latency. As the
306+
* latency's frequency and maximum value goes up with the number of
307+
* CPUs, scale the number of retries with the number of online
308+
* CPUs.
309+
*/
310+
return (ilog2(num_online_cpus()) / 2) + 1;
311+
}
312+
301313
void clocksource_verify_percpu(struct clocksource *cs);
302314

303315
/**

kernel/time/clocksource-wdtest.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ static void wdtest_ktime_clocksource_reset(void)
104104
static int wdtest_func(void *arg)
105105
{
106106
unsigned long j1, j2;
107+
int i, max_retries;
107108
char *s;
108-
int i;
109109

110110
schedule_timeout_uninterruptible(holdoff * HZ);
111111

@@ -139,18 +139,19 @@ static int wdtest_func(void *arg)
139139
WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
140140

141141
/* Verify tsc-like stability with various numbers of errors injected. */
142-
for (i = 0; i <= max_cswd_read_retries + 1; i++) {
143-
if (i <= 1 && i < max_cswd_read_retries)
142+
max_retries = clocksource_get_max_watchdog_retry();
143+
for (i = 0; i <= max_retries + 1; i++) {
144+
if (i <= 1 && i < max_retries)
144145
s = "";
145-
else if (i <= max_cswd_read_retries)
146+
else if (i <= max_retries)
146147
s = ", expect message";
147148
else
148149
s = ", expect clock skew";
149-
pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
150+
pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
150151
WRITE_ONCE(wdtest_ktime_read_ndelays, i);
151152
schedule_timeout_uninterruptible(2 * HZ);
152153
WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
153-
WARN_ON_ONCE((i <= max_cswd_read_retries) !=
154+
WARN_ON_ONCE((i <= max_retries) !=
154155
!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
155156
wdtest_ktime_clocksource_reset();
156157
}

kernel/time/clocksource.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,6 @@ void clocksource_mark_unstable(struct clocksource *cs)
210210
spin_unlock_irqrestore(&watchdog_lock, flags);
211211
}
212212

213-
ulong max_cswd_read_retries = 2;
214-
module_param(max_cswd_read_retries, ulong, 0644);
215-
EXPORT_SYMBOL_GPL(max_cswd_read_retries);
216213
static int verify_n_cpus = 8;
217214
module_param(verify_n_cpus, int, 0644);
218215

@@ -224,11 +221,12 @@ enum wd_read_status {
224221

225222
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
226223
{
227-
unsigned int nretries;
224+
unsigned int nretries, max_retries;
228225
u64 wd_end, wd_end2, wd_delta;
229226
int64_t wd_delay, wd_seq_delay;
230227

231-
for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
228+
max_retries = clocksource_get_max_watchdog_retry();
229+
for (nretries = 0; nretries <= max_retries; nretries++) {
232230
local_irq_disable();
233231
*wdnow = watchdog->read(watchdog);
234232
*csnow = cs->read(cs);
@@ -240,7 +238,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
240238
wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
241239
watchdog->shift);
242240
if (wd_delay <= WATCHDOG_MAX_SKEW) {
243-
if (nretries > 1 || nretries >= max_cswd_read_retries) {
241+
if (nretries > 1 || nretries >= max_retries) {
244242
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
245243
smp_processor_id(), watchdog->name, nretries);
246244
}

tools/testing/selftests/rcutorture/bin/torture.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ then
408408
torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog"
409409
torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make
410410

411-
torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1 tsc=watchdog"
411+
torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog"
412412
torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make
413413

414414
# In case our work is already done...

0 commit comments

Comments
 (0)