@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 			break;
 
 		child = group;
-		group = group->parent;
+		/*
+		 * Pairs with the store release on group connection
+		 * to make sure group initialization is visible.
+		 */
+		group = READ_ONCE(group->parent);
 		data->childmask = child->groupmask;
+		WARN_ON_ONCE(!data->childmask);
 	} while (group);
 }
 
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
 	while ((node = timerqueue_getnext(&group->events))) {
 		evt = container_of(node, struct tmigr_event, nextevt);
 
-		if (!evt->ignore) {
+		if (!READ_ONCE(evt->ignore)) {
 			WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
 			return evt;
 		}
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
 	 * lock is held while updating the ignore flag in idle path. So this
 	 * state change will not be lost.
 	 */
-	group->groupevt.ignore = true;
+	WRITE_ONCE(group->groupevt.ignore, true);
 
 	return walk_done;
 }
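
The two hunks above, together with the tmigr_update_events() changes below, turn the lockless accesses to the ignore flag into marked accesses. A minimal kernel-style sketch of that pattern follows; the names ignore_flag, writer_side() and reader_side() are invented for illustration and only assume a flag that one path updates while another path reads without holding the writer's lock:

#include <linux/compiler.h>
#include <linux/types.h>

static bool ignore_flag;

/* Update path: may run under its own lock, which the reader never takes. */
static void writer_side(bool value)
{
	/* Marked store: the compiler must not tear or omit the write. */
	WRITE_ONCE(ignore_flag, value);
}

/* Lockless read path. */
static bool reader_side(void)
{
	/* Marked load: the compiler must not re-read, fuse or tear the access. */
	return READ_ONCE(ignore_flag);
}

READ_ONCE()/WRITE_ONCE() add no ordering by themselves; the point is to keep the compiler from mangling a race the code tolerates (see the comment added in tmigr_update_events() below) and to mark that race as intentional for tools like KCSAN.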
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	union tmigr_state childstate, groupstate;
 	bool remote = data->remote;
 	bool walk_done = false;
+	bool ignore;
 	u64 nextexp;
 
 	if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		nextexp = child->next_expiry;
 		evt = &child->groupevt;
 
-		evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+		/*
+		 * This can race with concurrent idle exit (activate).
+		 * If the current writer wins, a useless remote expiration may
+		 * be scheduled. If the activate wins, the event is properly
+		 * ignored.
+		 */
+		ignore = (nextexp == KTIME_MAX) ? true : false;
+		WRITE_ONCE(evt->ignore, ignore);
 	} else {
 		nextexp = data->nextexp;
 
 		first_childevt = evt = data->evt;
+		ignore = evt->ignore;
 
 		/*
 		 * Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * first event information of the group is updated properly and
 	 * also handled properly, so skip this fast return path.
 	 */
-	if (evt->ignore && !remote && group->parent)
+	if (ignore && !remote && group->parent)
 		return true;
 
 	raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * queue when the expiry time changed only or when it could be ignored.
 	 */
 	if (timerqueue_node_queued(&evt->nextevt)) {
-		if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+		if ((evt->nextevt.expires == nextexp) && !ignore) {
 			/* Make sure not to miss a new CPU event with the same expiry */
 			evt->cpu = first_childevt->cpu;
 			goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		WRITE_ONCE(group->next_expiry, KTIME_MAX);
 	}
 
-	if (evt->ignore) {
+	if (ignore) {
 		/*
 		 * When the next child event could be ignored (nextexp is
 		 * KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
+	/*
+	 * If this is a new top-level, prepare its groupmask in advance.
+	 * This avoids accidents where yet another new top-level is
+	 * created in the future and made visible before the current groupmask.
+	 */
+	if (list_empty(&tmigr_level_list[lvl])) {
+		group->groupmask = BIT(0);
+		/*
+		 * The previous top level has prepared its groupmask already,
+		 * simply account it as the first child.
+		 */
+		if (lvl > 0)
+			group->num_children = 1;
+	}
+
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
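
For illustration only, a condensed sketch of the bookkeeping this new block sets up; demo_group, demo_init_new_top() and demo_connect_child() are made-up names, not tmigr API. A freshly created top-level group reserves BIT(0) as its own mask within a future parent and, above level 0, already counts the previous top as its first child, so the next child to connect receives BIT(1) and num_children reaches 2:

#include <linux/bits.h>
#include <linux/types.h>

struct demo_group {
	u8	groupmask;	/* mask of this group within its parent */
	int	num_children;
};

/* New top level: pre-assign the mask it will carry once a parent appears. */
static void demo_init_new_top(struct demo_group *group, unsigned int lvl)
{
	group->groupmask = BIT(0);
	/* Above level 0, the previous top is already accounted as child 0. */
	group->num_children = (lvl > 0) ? 1 : 0;
}

/* Regular connection: hand the next free bit to the newly added child. */
static u8 demo_connect_child(struct demo_group *parent)
{
	return BIT(parent->num_children++);
}

This matches the state the WARN_ON_ONCE() in tmigr_connect_child_parent() below checks for the activate case: the old top still carries BIT(0) and the new top has both of its children accounted.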
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	raw_spin_lock_irq(&child->lock);
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-	child->parent = parent;
-	child->groupmask = BIT(parent->num_children++);
+	if (activate) {
+		/*
+		 * @child is the old top and @parent the new one. In this
+		 * case groupmask is pre-initialized and @child already
+		 * accounted, along with its new sibling corresponding to the
+		 * CPU going up.
+		 */
+		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+	} else {
+		/* Adding @child for the CPU going up to @parent. */
+		child->groupmask = BIT(parent->num_children++);
+	}
+
+	/*
+	 * Make sure parent initialization is visible before publishing it to a
+	 * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+	 * address dependency that pairs with the READ_ONCE() in __walk_groups().
+	 */
+	smp_store_release(&child->parent, parent);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);
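
Together with the READ_ONCE() added to __walk_groups() in the first hunk, the store release above forms the usual publish/consume pattern: fully initialize the object, then publish its pointer with RELEASE semantics so that a lockless walker loading the pointer also observes the initialized contents. A minimal kernel-style sketch with invented names (demo_node, demo_publish(), demo_walk()), assuming the walker runs without the locks held here:

#include <linux/compiler.h>
#include <asm/barrier.h>

struct demo_node {
	int			groupmask;
	struct demo_node	*parent;
};

static void demo_publish(struct demo_node *child, struct demo_node *parent)
{
	parent->groupmask = 1;	/* initialization first ... */

	/* ... then publication, ordered after the initialization above. */
	smp_store_release(&child->parent, parent);
}

static int demo_walk(struct demo_node *child)
{
	/*
	 * Pairs with the release store: the address dependency from the
	 * pointer load orders the dependent read of parent->groupmask.
	 */
	struct demo_node *parent = READ_ONCE(child->parent);

	return parent ? parent->groupmask : 0;
}

As the new comment in tmigr_connect_child_parent() spells out, no explicit acquire is needed on the walker side: the RELEASE store pairs with READ_ONCE() through the address dependency on the parent pointer.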