Commit fb3cb7f

Merge: clocksource: Avoid calling get_random_u32() in atomic context
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6339
JIRA: https://issues.redhat.com/browse/RHEL-76143

The last patch is the one that fixes the invalid-context problem reported in the Jira issue. The other clocksource patches are included to reduce merge conflicts.

Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Tony Camuso <tcamuso@redhat.com>
Approved-by: Phil Auld <pauld@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Patrick Talbert <ptalbert@redhat.com>
2 parents d1e7f16 + a287085 commit fb3cb7f

File tree: 7 files changed (+139 lines, -68 lines)


Documentation/admin-guide/kernel-parameters.txt

Lines changed: 0 additions & 6 deletions
@@ -669,12 +669,6 @@
                 loops can be debugged more effectively on production
                 systems.
 
-        clocksource.max_cswd_read_retries= [KNL]
-                Number of clocksource_watchdog() retries due to
-                external delays before the clock will be marked
-                unstable. Defaults to two retries, that is,
-                three attempts to read the clock under test.
-
         clocksource.verify_n_cpus= [KNL]
                 Limit the number of CPUs checked for clocksources
                 marked with CLOCK_SOURCE_VERIFY_PERCPU that

include/linux/clocksource.h

Lines changed: 13 additions & 1 deletion
@@ -297,7 +297,19 @@ static inline void timer_probe(void) {}
 #define TIMER_ACPI_DECLARE(name, table_id, fn) \
         ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
 
-extern ulong max_cswd_read_retries;
+static inline unsigned int clocksource_get_max_watchdog_retry(void)
+{
+        /*
+         * When system is in the boot phase or under heavy workload, there
+         * can be random big latencies during the clocksource/watchdog
+         * read, so allow retries to filter the noise latency. As the
+         * latency's frequency and maximum value goes up with the number of
+         * CPUs, scale the number of retries with the number of online
+         * CPUs.
+         */
+        return (ilog2(num_online_cpus()) / 2) + 1;
+}
+
 void clocksource_verify_percpu(struct clocksource *cs);
 
 /**
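
A quick way to see how this heuristic scales with machine size (a standalone user-space sketch for illustration only; ilog2_u32() below is a stand-in for the kernel's ilog2() and is not part of the patch):

#include <stdio.h>

/* Stand-in for the kernel's ilog2(): floor(log2(v)) for v >= 1. */
static unsigned int ilog2_u32(unsigned int v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned int cpus[] = { 1, 4, 16, 64, 256, 1024 };

        /* Same formula as clocksource_get_max_watchdog_retry(). */
        for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
                printf("%4u CPUs -> %u retries\n",
                       cpus[i], ilog2_u32(cpus[i]) / 2 + 1);
        return 0;       /* prints 1, 2, 3, 4, 5 and 6 retries respectively */
}

Compared with the old fixed max_cswd_read_retries=2 default, very small machines get slightly fewer retries and large ones get a few more, but even huge boxes stay at a handful.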

kernel/time/Kconfig

Lines changed: 13 additions & 0 deletions
@@ -196,5 +196,18 @@ config HIGH_RES_TIMERS
           hardware is not capable then this option only increases
           the size of the kernel image.
 
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+        int "Clocksource watchdog maximum allowable skew (in μs)"
+        depends on CLOCKSOURCE_WATCHDOG
+        range 50 1000
+        default 125
+        help
+          Specify the maximum amount of allowable watchdog skew in
+          microseconds before reporting the clocksource to be unstable.
+          The default is based on a half-second clocksource watchdog
+          interval and NTP's maximum frequency drift of 500 parts
+          per million. If the clocksource is good enough for NTP,
+          it is good enough for the clocksource watchdog!
+
 endmenu
 endif
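
For scale (an interpretation of the help text, not something the patch states explicitly): 500 ppm of drift over the half-second watchdog interval works out to 0.5 s x 500/1,000,000 = 250 μs, and the skew check in kernel/time/clocksource.c compares against the sum of the two clocksources' uncertainty margins (md = cs->uncertainty_margin + watchdog->uncertainty_margin), so two 125 μs margins add up to exactly that 250 μs budget.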

kernel/time/clocksource-wdtest.c

Lines changed: 7 additions & 6 deletions
@@ -104,8 +104,8 @@ static void wdtest_ktime_clocksource_reset(void)
 static int wdtest_func(void *arg)
 {
         unsigned long j1, j2;
+        int i, max_retries;
         char *s;
-        int i;
 
         schedule_timeout_uninterruptible(holdoff * HZ);
 
@@ -139,18 +139,19 @@ static int wdtest_func(void *arg)
         WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
 
         /* Verify tsc-like stability with various numbers of errors injected. */
-        for (i = 0; i <= max_cswd_read_retries + 1; i++) {
-                if (i <= 1 && i < max_cswd_read_retries)
+        max_retries = clocksource_get_max_watchdog_retry();
+        for (i = 0; i <= max_retries + 1; i++) {
+                if (i <= 1 && i < max_retries)
                         s = "";
-                else if (i <= max_cswd_read_retries)
+                else if (i <= max_retries)
                         s = ", expect message";
                 else
                         s = ", expect clock skew";
-                pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+                pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
                 WRITE_ONCE(wdtest_ktime_read_ndelays, i);
                 schedule_timeout_uninterruptible(2 * HZ);
                 WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
-                WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+                WARN_ON_ONCE((i <= max_retries) !=
                              !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
                 wdtest_ktime_clocksource_reset();
         }
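
To make the updated loop concrete, assume a machine where clocksource_get_max_watchdog_retry() returns 2 (anywhere from 4 to 15 online CPUs with the new formula). The test then injects i = 0..3 delayed reads per watchdog pass and expects:

    i = 0 or 1: reads succeed within the retry budget, no message, clocksource stays stable
    i = 2:      succeeds on the last retry, "retried ... times before success" message, still stable
    i = 3:      exceeds the retry budget, clock skew is declared and CLOCK_SOURCE_UNSTABLE is set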

kernel/time/clocksource.c

Lines changed: 102 additions & 52 deletions
@@ -20,6 +20,16 @@
 #include "tick-internal.h"
 #include "timekeeping_internal.h"
 
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
+{
+        u64 delta = clocksource_delta(end, start, cs->mask);
+
+        if (likely(delta < cs->max_cycles))
+                return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+        return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
 /**
  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
  * @mult: pointer to mult variable
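
The reason for the max_cycles cut-off, as I read it: clocksource_cyc2ns() is a plain 64-bit "cycles * mult >> shift", which silently wraps once the cycle delta exceeds cs->max_cycles (for example across a long suspend or a badly stretched watchdog interval), whereas mul_u64_u32_shr() keeps the full wide intermediate product. A rough user-space sketch of the failure mode (the mult/shift constants are made up for illustration, and unsigned __int128 is a GCC/Clang extension):

#include <stdio.h>
#include <stdint.h>

#define MULT  4194304u          /* hypothetical mult (2^22), i.e. 1 ns per cycle */
#define SHIFT 22                /* hypothetical shift */

/* What clocksource_cyc2ns() does: wraps when cycles * MULT overflows 64 bits. */
static uint64_t cyc2ns_naive(uint64_t cycles)
{
        return (cycles * MULT) >> SHIFT;
}

/* What mul_u64_u32_shr() guarantees: the intermediate product stays wide. */
static uint64_t cyc2ns_wide(uint64_t cycles)
{
        return (uint64_t)(((unsigned __int128)cycles * MULT) >> SHIFT);
}

int main(void)
{
        uint64_t big = 1ULL << 50;      /* ~13 days' worth of 1 GHz cycles */

        printf("naive: %llu ns\n", (unsigned long long)cyc2ns_naive(big)); /* 0: overflowed */
        printf("wide : %llu ns\n", (unsigned long long)cyc2ns_wide(big));  /* ~1.1e15 ns */
        return 0;
}
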
@@ -95,6 +105,12 @@ static char override_name[CS_NAME_LEN];
 static int finished_booting;
 static u64 suspend_start;
 
+/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
+
 /*
  * Threshold: 0.0312s, when doubled: 0.0625s.
  * Also a default for cs->uncertainty_margin when registering clocks.
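
Plugging the definitions in: WATCHDOG_INTERVAL is HZ >> 1, i.e. half a second's worth of jiffies, so WATCHDOG_INTERVAL_MAX_NS = (2 * HZ/2) * (NSEC_PER_SEC / HZ) = NSEC_PER_SEC. In other words, a watchdog pass whose measured interval stretches past one second trips the "Long readout interval" skip path added further down in this file.
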
@@ -106,8 +122,17 @@ static u64 suspend_start
  * clocksource surrounding a read of the clocksource being validated.
  * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
  * a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
  */
-#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC   CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC   (125 * WATCHDOG_INTERVAL / HZ)
+#endif
+
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
@@ -120,6 +145,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
 static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
 
 static inline void clocksource_watchdog_lock(unsigned long *flags)
 {
@@ -134,11 +160,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
 static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-
 static void clocksource_watchdog_work(struct work_struct *work)
 {
         /*
@@ -199,9 +220,6 @@ void clocksource_mark_unstable(struct clocksource *cs)
         spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-ulong max_cswd_read_retries = 2;
-module_param(max_cswd_read_retries, ulong, 0644);
-EXPORT_SYMBOL_GPL(max_cswd_read_retries);
 static int verify_n_cpus = 8;
 module_param(verify_n_cpus, int, 0644);
 
@@ -213,23 +231,22 @@ enum wd_read_status {
 
 static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
 {
-        unsigned int nretries;
-        u64 wd_end, wd_end2, wd_delta;
+        unsigned int nretries, max_retries;
         int64_t wd_delay, wd_seq_delay;
+        u64 wd_end, wd_end2;
 
-        for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+        max_retries = clocksource_get_max_watchdog_retry();
+        for (nretries = 0; nretries <= max_retries; nretries++) {
                 local_irq_disable();
                 *wdnow = watchdog->read(watchdog);
                 *csnow = cs->read(cs);
                 wd_end = watchdog->read(watchdog);
                 wd_end2 = watchdog->read(watchdog);
                 local_irq_enable();
 
-                wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
-                wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
-                                              watchdog->shift);
+                wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
                 if (wd_delay <= WATCHDOG_MAX_SKEW) {
-                        if (nretries > 1 || nretries >= max_cswd_read_retries) {
+                        if (nretries > 1 || nretries >= max_retries) {
                                 pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
                                         smp_processor_id(), watchdog->name, nretries);
                         }
@@ -245,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
                  * report system busy, reinit the watchdog and skip the current
                  * watchdog test.
                  */
-                wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
-                wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+                wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
                 if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
                         goto skip_test;
         }
@@ -335,16 +351,18 @@ void clocksource_verify_percpu(struct clocksource *cs)
         cpumask_clear(&cpus_ahead);
         cpumask_clear(&cpus_behind);
         cpus_read_lock();
-        preempt_disable();
+        migrate_disable();
         clocksource_verify_choose_cpus();
-        if (cpumask_weight(&cpus_chosen) == 0) {
-                preempt_enable();
+        if (cpumask_empty(&cpus_chosen)) {
+                migrate_enable();
                 cpus_read_unlock();
                 pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
                 return;
         }
         testcpu = smp_processor_id();
-        pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+        pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+                cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+        preempt_disable();
         for_each_cpu(cpu, &cpus_chosen) {
                 if (cpu == testcpu)
                         continue;
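
This hunk is what the merge subject points at: the CPU-selection step, clocksource_verify_choose_cpus(), involves get_random_u32(), which is not safe to call with preemption disabled on PREEMPT_RT. Running the selection under migrate_disable() instead, and only disabling preemption afterwards for the per-CPU measurement loop, keeps the random-number call out of atomic context.
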
@@ -357,14 +375,14 @@ void clocksource_verify_percpu(struct clocksource *cs)
                 delta = (csnow_end - csnow_mid) & cs->mask;
                 if (delta < 0)
                         cpumask_set_cpu(cpu, &cpus_ahead);
-                delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
-                cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+                cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
                 if (cs_nsec > cs_nsec_max)
                         cs_nsec_max = cs_nsec;
                 if (cs_nsec < cs_nsec_min)
                         cs_nsec_min = cs_nsec;
         }
         preempt_enable();
+        migrate_enable();
         cpus_read_unlock();
         if (!cpumask_empty(&cpus_ahead))
                 pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
@@ -378,13 +396,23 @@ void clocksource_verify_percpu(struct clocksource *cs)
 }
 EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
 
+static inline void clocksource_reset_watchdog(void)
+{
+        struct clocksource *cs;
+
+        list_for_each_entry(cs, &watchdog_list, wd_list)
+                cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
 static void clocksource_watchdog(struct timer_list *unused)
 {
-        u64 csnow, wdnow, cslast, wdlast, delta;
+        int64_t wd_nsec, cs_nsec, interval;
+        u64 csnow, wdnow, cslast, wdlast;
         int next_cpu, reset_pending;
-        int64_t wd_nsec, cs_nsec;
         struct clocksource *cs;
         enum wd_read_status read_ret;
+        unsigned long extra_wait = 0;
         u32 md;
 
         spin_lock(&watchdog_lock);
@@ -404,13 +432,30 @@ static void clocksource_watchdog(struct timer_list *unused)
 
                 read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
 
-                if (read_ret != WD_READ_SUCCESS) {
-                        if (read_ret == WD_READ_UNSTABLE)
-                                /* Clock readout unreliable, so give it up. */
-                                __clocksource_unstable(cs);
+                if (read_ret == WD_READ_UNSTABLE) {
+                        /* Clock readout unreliable, so give it up. */
+                        __clocksource_unstable(cs);
                         continue;
                 }
 
+                /*
+                 * When WD_READ_SKIP is returned, it means the system is likely
+                 * under very heavy load, where the latency of reading
+                 * watchdog/clocksource is very big, and affect the accuracy of
+                 * watchdog check. So give system some space and suspend the
+                 * watchdog check for 5 minutes.
+                 */
+                if (read_ret == WD_READ_SKIP) {
+                        /*
+                         * As the watchdog timer will be suspended, and
+                         * cs->last could keep unchanged for 5 minutes, reset
+                         * the counters.
+                         */
+                        clocksource_reset_watchdog();
+                        extra_wait = HZ * 300;
+                        break;
+                }
+
                 /* Clocksource initialized ? */
                 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
                     atomic_read(&watchdog_reset_pending)) {
@@ -420,12 +465,8 @@ static void clocksource_watchdog(struct timer_list *unused)
                         continue;
                 }
 
-                delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
-                wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
-                                             watchdog->shift);
-
-                delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
-                cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+                wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+                cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
                 wdlast = cs->wd_last; /* save these in case we print them */
                 cslast = cs->cs_last;
                 cs->cs_last = csnow;
@@ -434,6 +475,27 @@ static void clocksource_watchdog(struct timer_list *unused)
                 if (atomic_read(&watchdog_reset_pending))
                         continue;
 
+                /*
+                 * The processing of timer softirqs can get delayed (usually
+                 * on account of ksoftirqd not getting to run in a timely
+                 * manner), which causes the watchdog interval to stretch.
+                 * Skew detection may fail for longer watchdog intervals
+                 * on account of fixed margins being used.
+                 * Some clocksources, e.g. acpi_pm, cannot tolerate
+                 * watchdog intervals longer than a few seconds.
+                 */
+                interval = max(cs_nsec, wd_nsec);
+                if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+                        if (system_state > SYSTEM_SCHEDULING &&
+                            interval > 2 * watchdog_max_interval) {
+                                watchdog_max_interval = interval;
+                                pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+                                        cs_nsec, wd_nsec);
+                        }
+                        watchdog_timer.expires = jiffies;
+                        continue;
+                }
+
                 /* Check the deviation from the watchdog clocksource. */
                 md = cs->uncertainty_margin + watchdog->uncertainty_margin;
                 if (abs(cs_nsec - wd_nsec) > md) {
@@ -506,7 +568,7 @@ static void clocksource_watchdog(struct timer_list *unused)
          * pair clocksource_stop_watchdog() clocksource_start_watchdog().
          */
         if (!timer_pending(&watchdog_timer)) {
-                watchdog_timer.expires += WATCHDOG_INTERVAL;
+                watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
                 add_timer_on(&watchdog_timer, next_cpu);
         }
 out:
@@ -531,14 +593,6 @@ static inline void clocksource_stop_watchdog(void)
         watchdog_running = 0;
 }
 
-static inline void clocksource_reset_watchdog(void)
-{
-        struct clocksource *cs;
-
-        list_for_each_entry(cs, &watchdog_list, wd_list)
-                cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
 static void clocksource_resume_watchdog(void)
 {
         atomic_inc(&watchdog_reset_pending);
@@ -775,7 +829,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
  */
 u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
 {
-        u64 now, delta, nsec = 0;
+        u64 now, nsec = 0;
 
         if (!suspend_clocksource)
                 return 0;
@@ -790,12 +844,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
         else
                 now = suspend_clocksource->read(suspend_clocksource);
 
-        if (now > suspend_start) {
-                delta = clocksource_delta(now, suspend_start,
-                                          suspend_clocksource->mask);
-                nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
-                                       suspend_clocksource->shift);
-        }
+        if (now > suspend_start)
+                nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
 
         /*
          * Disable the suspend timer to save power if current clocksource is
