2020#include "tick-internal.h"
2121#include "timekeeping_internal.h"
2222
23+ static noinline u64 cycles_to_nsec_safe (struct clocksource * cs , u64 start , u64 end )
24+ {
25+ u64 delta = clocksource_delta (end , start , cs -> mask );
26+
27+ if (likely (delta < cs -> max_cycles ))
28+ return clocksource_cyc2ns (delta , cs -> mult , cs -> shift );
29+
30+ return mul_u64_u32_shr (delta , cs -> mult , cs -> shift );
31+ }
32+
2333/**
2434 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
2535 * @mult: pointer to mult variable
@@ -95,6 +105,12 @@ static char override_name[CS_NAME_LEN];
95105static int finished_booting ;
96106static u64 suspend_start ;
97107
108+ /*
109+ * Interval: 0.5sec.
110+ */
111+ #define WATCHDOG_INTERVAL (HZ >> 1)
112+ #define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
113+
98114/*
99115 * Threshold: 0.0312s, when doubled: 0.0625s.
100116 * Also a default for cs->uncertainty_margin when registering clocks.
@@ -106,8 +122,17 @@ static u64 suspend_start;
106122 * clocksource surrounding a read of the clocksource being validated.
107123 * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
108124 * a lower bound for cs->uncertainty_margin values when registering clocks.
125+ *
126+ * The default of 500 parts per million is based on NTP's limits.
127+ * If a clocksource is good enough for NTP, it is good enough for us!
109128 */
110- #define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
129+ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
130+ #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
131+ #else
132+ #define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
133+ #endif
134+
135+ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
111136
112137#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
113138static void clocksource_watchdog_work (struct work_struct * work );
@@ -120,6 +145,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
120145static DEFINE_SPINLOCK (watchdog_lock );
121146static int watchdog_running ;
122147static atomic_t watchdog_reset_pending ;
148+ static int64_t watchdog_max_interval ;
123149
124150static inline void clocksource_watchdog_lock (unsigned long * flags )
125151{
@@ -134,11 +160,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
134160static int clocksource_watchdog_kthread (void * data );
135161static void __clocksource_change_rating (struct clocksource * cs , int rating );
136162
137- /*
138- * Interval: 0.5sec.
139- */
140- #define WATCHDOG_INTERVAL (HZ >> 1)
141-
142163static void clocksource_watchdog_work (struct work_struct * work )
143164{
144165 /*
@@ -199,9 +220,6 @@ void clocksource_mark_unstable(struct clocksource *cs)
199220 spin_unlock_irqrestore (& watchdog_lock , flags );
200221}
201222
202- ulong max_cswd_read_retries = 2 ;
203- module_param (max_cswd_read_retries , ulong , 0644 );
204- EXPORT_SYMBOL_GPL (max_cswd_read_retries );
205223static int verify_n_cpus = 8 ;
206224module_param (verify_n_cpus , int , 0644 );
207225
@@ -213,23 +231,22 @@ enum wd_read_status {
213231
214232static enum wd_read_status cs_watchdog_read (struct clocksource * cs , u64 * csnow , u64 * wdnow )
215233{
216- unsigned int nretries ;
217- u64 wd_end , wd_end2 , wd_delta ;
234+ unsigned int nretries , max_retries ;
218235 int64_t wd_delay , wd_seq_delay ;
236+ u64 wd_end , wd_end2 ;
219237
220- for (nretries = 0 ; nretries <= max_cswd_read_retries ; nretries ++ ) {
238+ max_retries = clocksource_get_max_watchdog_retry ();
239+ for (nretries = 0 ; nretries <= max_retries ; nretries ++ ) {
221240 local_irq_disable ();
222241 * wdnow = watchdog -> read (watchdog );
223242 * csnow = cs -> read (cs );
224243 wd_end = watchdog -> read (watchdog );
225244 wd_end2 = watchdog -> read (watchdog );
226245 local_irq_enable ();
227246
228- wd_delta = clocksource_delta (wd_end , * wdnow , watchdog -> mask );
229- wd_delay = clocksource_cyc2ns (wd_delta , watchdog -> mult ,
230- watchdog -> shift );
247+ wd_delay = cycles_to_nsec_safe (watchdog , * wdnow , wd_end );
231248 if (wd_delay <= WATCHDOG_MAX_SKEW ) {
232- if (nretries > 1 || nretries >= max_cswd_read_retries ) {
249+ if (nretries > 1 || nretries >= max_retries ) {
233250 pr_warn ("timekeeping watchdog on CPU%d: %s retried %d times before success\n" ,
234251 smp_processor_id (), watchdog -> name , nretries );
235252 }
@@ -245,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
245262 * report system busy, reinit the watchdog and skip the current
246263 * watchdog test.
247264 */
248- wd_delta = clocksource_delta (wd_end2 , wd_end , watchdog -> mask );
249- wd_seq_delay = clocksource_cyc2ns (wd_delta , watchdog -> mult , watchdog -> shift );
265+ wd_seq_delay = cycles_to_nsec_safe (watchdog , wd_end , wd_end2 );
250266 if (wd_seq_delay > WATCHDOG_MAX_SKEW /2 )
251267 goto skip_test ;
252268 }
@@ -335,16 +351,18 @@ void clocksource_verify_percpu(struct clocksource *cs)
335351 cpumask_clear (& cpus_ahead );
336352 cpumask_clear (& cpus_behind );
337353 cpus_read_lock ();
338- preempt_disable ();
354+ migrate_disable ();
339355 clocksource_verify_choose_cpus ();
340- if (cpumask_weight (& cpus_chosen ) == 0 ) {
341- preempt_enable ();
356+ if (cpumask_empty (& cpus_chosen )) {
357+ migrate_enable ();
342358 cpus_read_unlock ();
343359 pr_warn ("Not enough CPUs to check clocksource '%s'.\n" , cs -> name );
344360 return ;
345361 }
346362 testcpu = smp_processor_id ();
347- pr_warn ("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n" , cs -> name , testcpu , cpumask_pr_args (& cpus_chosen ));
363+ pr_info ("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n" ,
364+ cs -> name , testcpu , cpumask_pr_args (& cpus_chosen ));
365+ preempt_disable ();
348366 for_each_cpu (cpu , & cpus_chosen ) {
349367 if (cpu == testcpu )
350368 continue ;
@@ -357,14 +375,14 @@ void clocksource_verify_percpu(struct clocksource *cs)
357375 delta = (csnow_end - csnow_mid ) & cs -> mask ;
358376 if (delta < 0 )
359377 cpumask_set_cpu (cpu , & cpus_ahead );
360- delta = clocksource_delta (csnow_end , csnow_begin , cs -> mask );
361- cs_nsec = clocksource_cyc2ns (delta , cs -> mult , cs -> shift );
378+ cs_nsec = cycles_to_nsec_safe (cs , csnow_begin , csnow_end );
362379 if (cs_nsec > cs_nsec_max )
363380 cs_nsec_max = cs_nsec ;
364381 if (cs_nsec < cs_nsec_min )
365382 cs_nsec_min = cs_nsec ;
366383 }
367384 preempt_enable ();
385+ migrate_enable ();
368386 cpus_read_unlock ();
369387 if (!cpumask_empty (& cpus_ahead ))
370388 pr_warn (" CPUs %*pbl ahead of CPU %d for clocksource %s.\n" ,
@@ -378,13 +396,23 @@ void clocksource_verify_percpu(struct clocksource *cs)
378396}
379397EXPORT_SYMBOL_GPL (clocksource_verify_percpu );
380398
399+ static inline void clocksource_reset_watchdog (void )
400+ {
401+ struct clocksource * cs ;
402+
403+ list_for_each_entry (cs , & watchdog_list , wd_list )
404+ cs -> flags &= ~CLOCK_SOURCE_WATCHDOG ;
405+ }
406+
407+
381408static void clocksource_watchdog (struct timer_list * unused )
382409{
383- u64 csnow , wdnow , cslast , wdlast , delta ;
410+ int64_t wd_nsec , cs_nsec , interval ;
411+ u64 csnow , wdnow , cslast , wdlast ;
384412 int next_cpu , reset_pending ;
385- int64_t wd_nsec , cs_nsec ;
386413 struct clocksource * cs ;
387414 enum wd_read_status read_ret ;
415+ unsigned long extra_wait = 0 ;
388416 u32 md ;
389417
390418 spin_lock (& watchdog_lock );
@@ -404,13 +432,30 @@ static void clocksource_watchdog(struct timer_list *unused)
404432
405433 read_ret = cs_watchdog_read (cs , & csnow , & wdnow );
406434
407- if (read_ret != WD_READ_SUCCESS ) {
408- if (read_ret == WD_READ_UNSTABLE )
409- /* Clock readout unreliable, so give it up. */
410- __clocksource_unstable (cs );
435+ if (read_ret == WD_READ_UNSTABLE ) {
436+ /* Clock readout unreliable, so give it up. */
437+ __clocksource_unstable (cs );
411438 continue ;
412439 }
413440
441+ 		/*
442+ 		 * When WD_READ_SKIP is returned, it means the system is likely
443+ 		 * under very heavy load, where the latency of reading the
444+ 		 * watchdog/clocksource is very high and affects the accuracy of
445+ 		 * the watchdog check. So give the system some breathing room and
446+ 		 * suspend the watchdog check for 5 minutes.
447+ 		 */
448+ if (read_ret == WD_READ_SKIP ) {
449+ /*
450+ * As the watchdog timer will be suspended, and
451+ * cs->last could keep unchanged for 5 minutes, reset
452+ * the counters.
453+ */
454+ clocksource_reset_watchdog ();
455+ extra_wait = HZ * 300 ;
456+ break ;
457+ }
458+
414459 /* Clocksource initialized ? */
415460 if (!(cs -> flags & CLOCK_SOURCE_WATCHDOG ) ||
416461 atomic_read (& watchdog_reset_pending )) {
@@ -420,12 +465,8 @@ static void clocksource_watchdog(struct timer_list *unused)
420465 continue ;
421466 }
422467
423- delta = clocksource_delta (wdnow , cs -> wd_last , watchdog -> mask );
424- wd_nsec = clocksource_cyc2ns (delta , watchdog -> mult ,
425- watchdog -> shift );
426-
427- delta = clocksource_delta (csnow , cs -> cs_last , cs -> mask );
428- cs_nsec = clocksource_cyc2ns (delta , cs -> mult , cs -> shift );
468+ wd_nsec = cycles_to_nsec_safe (watchdog , cs -> wd_last , wdnow );
469+ cs_nsec = cycles_to_nsec_safe (cs , cs -> cs_last , csnow );
429470 wdlast = cs -> wd_last ; /* save these in case we print them */
430471 cslast = cs -> cs_last ;
431472 cs -> cs_last = csnow ;
@@ -434,6 +475,27 @@ static void clocksource_watchdog(struct timer_list *unused)
434475 if (atomic_read (& watchdog_reset_pending ))
435476 continue ;
436477
478+ /*
479+ * The processing of timer softirqs can get delayed (usually
480+ * on account of ksoftirqd not getting to run in a timely
481+ * manner), which causes the watchdog interval to stretch.
482+ * Skew detection may fail for longer watchdog intervals
483+ * on account of fixed margins being used.
484+ * Some clocksources, e.g. acpi_pm, cannot tolerate
485+ * watchdog intervals longer than a few seconds.
486+ */
487+ interval = max (cs_nsec , wd_nsec );
488+ if (unlikely (interval > WATCHDOG_INTERVAL_MAX_NS )) {
489+ if (system_state > SYSTEM_SCHEDULING &&
490+ interval > 2 * watchdog_max_interval ) {
491+ watchdog_max_interval = interval ;
492+ pr_warn ("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n" ,
493+ cs_nsec , wd_nsec );
494+ }
495+ watchdog_timer .expires = jiffies ;
496+ continue ;
497+ }
498+
437499 /* Check the deviation from the watchdog clocksource. */
438500 md = cs -> uncertainty_margin + watchdog -> uncertainty_margin ;
439501 if (abs (cs_nsec - wd_nsec ) > md ) {
@@ -506,7 +568,7 @@ static void clocksource_watchdog(struct timer_list *unused)
506568 * pair clocksource_stop_watchdog() clocksource_start_watchdog().
507569 */
508570 if (!timer_pending (& watchdog_timer )) {
509- watchdog_timer .expires += WATCHDOG_INTERVAL ;
571+ watchdog_timer .expires += WATCHDOG_INTERVAL + extra_wait ;
510572 add_timer_on (& watchdog_timer , next_cpu );
511573 }
512574out :
@@ -531,14 +593,6 @@ static inline void clocksource_stop_watchdog(void)
531593 watchdog_running = 0 ;
532594}
533595
534- static inline void clocksource_reset_watchdog (void )
535- {
536- struct clocksource * cs ;
537-
538- list_for_each_entry (cs , & watchdog_list , wd_list )
539- cs -> flags &= ~CLOCK_SOURCE_WATCHDOG ;
540- }
541-
542596static void clocksource_resume_watchdog (void )
543597{
544598 atomic_inc (& watchdog_reset_pending );
@@ -775,7 +829,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
775829 */
776830u64 clocksource_stop_suspend_timing (struct clocksource * cs , u64 cycle_now )
777831{
778- u64 now , delta , nsec = 0 ;
832+ u64 now , nsec = 0 ;
779833
780834 if (!suspend_clocksource )
781835 return 0 ;
@@ -790,12 +844,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
790844 else
791845 now = suspend_clocksource -> read (suspend_clocksource );
792846
793- if (now > suspend_start ) {
794- delta = clocksource_delta (now , suspend_start ,
795- suspend_clocksource -> mask );
796- nsec = mul_u64_u32_shr (delta , suspend_clocksource -> mult ,
797- suspend_clocksource -> shift );
798- }
847+ if (now > suspend_start )
848+ nsec = cycles_to_nsec_safe (suspend_clocksource , suspend_start , now );
799849
800850 /*
801851 * Disable the suspend timer to save power if current clocksource is
0 commit comments