@@ -60,6 +60,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE 64
 
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+	int cpu;
+
+	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+	if (!blkcg->lhead)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+	return 0;
+}
+
 /**
  * blkcg_css - find the current css
  *
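The per-cpu lhead heads allocated above are kernel llists: any number of producers can push entries with a lock-free cmpxchg loop, and a single consumer detaches the whole list in one atomic exchange, which is exactly the update/flush split described in the comment. A minimal userspace analogue of that add/del_all pattern, assuming C11 atomics and hypothetical demo_* names (the kernel's own implementation lives in llist.h and is additionally usable from IRQ and NMI context):

#include <stdatomic.h>
#include <stdio.h>

struct demo_lnode {
	struct demo_lnode *next;
	int value;
};

struct demo_lhead {
	_Atomic(struct demo_lnode *) first;
};

/* Producers push with a CAS retry loop; no lock is taken. */
static void demo_llist_add(struct demo_lnode *n, struct demo_lhead *h)
{
	struct demo_lnode *first = atomic_load(&h->first);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&h->first, &first, n));
}

/* The single flusher detaches the whole list with one exchange. */
static struct demo_lnode *demo_llist_del_all(struct demo_lhead *h)
{
	return atomic_exchange(&h->first, NULL);
}

int main(void)
{
	struct demo_lhead head = { NULL };
	struct demo_lnode a = { .value = 1 }, b = { .value = 2 };

	demo_llist_add(&a, &head);
	demo_llist_add(&b, &head);

	for (struct demo_lnode *n = demo_llist_del_all(&head); n; n = n->next)
		printf("%d\n", n->value);	/* LIFO: prints 2, then 1 */
	return 0;
}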
@@ -237,8 +268,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->blkcg = blkcg;
 
 	u64_stats_init(&blkg->iostat.sync);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+	}
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
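Setting the per-cpu blkg back-pointer here is what lets the flush side map a queued entry back to its owner, since the lockless list only hands back the embedded llist_node. The companion header change is not part of this excerpt, but judging from the fields the patch uses (bisc->blkg, bisc->lnode, bisc->lqueued), the extended struct blkg_iostat_set presumably looks roughly like this sketch; field order and comments are illustrative only:

struct blkg_iostat_set {
	struct u64_stats_sync	sync;		/* consistent reads of cur on 32-bit */
	struct blkcg_gq		*blkg;		/* back-pointer, set in blkg_alloc() */
	struct llist_node	lnode;		/* entry in a per-cpu lockless list */
	int			lqueued;	/* true while lnode sits on a list */
	struct blkg_iostat	cur;
	struct blkg_iostat	last;
};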
@@ -828,20 +861,31 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode;
+	struct blkg_iostat_set *bisc, *next_bisc;
 
 	/* Root-level stats are sourced from system-wide IO stats */
 	if (!cgroup_parent(css->cgroup))
 		return;
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+	lnode = llist_del_all(lhead);
+	if (!lnode)
+		goto out;
+
+	/*
+	 * Iterate only the iostat_cpu's queued in the lockless list.
+	 */
+	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
-		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
 		struct blkg_iostat cur;
 		unsigned int seq;
 
+		WRITE_ONCE(bisc->lqueued, false);
+
 		/* fetch the current per-cpu values */
 		do {
 			seq = u64_stats_fetch_begin(&bisc->sync);
@@ -854,8 +898,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
+		percpu_ref_put(&blkg->refcnt);
 	}
 
+out:
 	rcu_read_unlock();
 }
 
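Two details of the loop above are easy to miss. The safe iterator is needed because the moment lqueued is cleared, the update side may requeue the entry onto a fresh list and rewrite lnode.next, so the next entry has to be sampled before the body runs; llist_for_each_entry_safe() does exactly that. And the do/while whose first line is visible above (its body falls in the lines elided between the two hunks) is the usual u64_stats snapshot pattern, sketched here with the names from the loop; the plain struct copy stands in for whatever helper the real code uses:

		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			cur = bisc->cur;	/* snapshot the per-cpu counters */
		} while (u64_stats_fetch_retry(&bisc->sync, seq));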
@@ -1133,14 +1179,14 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 
 	mutex_unlock(&blkcg_pol_mutex);
 
+	free_percpu(blkcg->lhead);
 	kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct blkcg *blkcg;
-	struct cgroup_subsys_state *ret;
 	int i;
 
 	mutex_lock(&blkcg_pol_mutex);
@@ -1149,12 +1195,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		blkcg = &blkcg_root;
 	} else {
 		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-		if (!blkcg) {
-			ret = ERR_PTR(-ENOMEM);
+		if (!blkcg)
 			goto unlock;
-		}
 	}
 
+	if (init_blkcg_llists(blkcg))
+		goto free_blkcg;
+
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -1169,10 +1216,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 			continue;
 
 		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
-		if (!cpd) {
-			ret = ERR_PTR(-ENOMEM);
+		if (!cpd)
 			goto free_pd_blkcg;
-		}
+
 		blkcg->cpd[i] = cpd;
 		cpd->blkcg = blkcg;
 		cpd->plid = i;
@@ -1196,12 +1242,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	for (i--; i >= 0; i--)
 		if (blkcg->cpd[i])
 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+	free_percpu(blkcg->lhead);
+free_blkcg:
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 unlock:
 	mutex_unlock(&blkcg_pol_mutex);
-	return ret;
+	return ERR_PTR(-ENOMEM);
 }
 
 static int blkcg_css_online(struct cgroup_subsys_state *css)
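With init_blkcg_llists() slotted in between kzalloc() and the policy loop, the unwind path above has to mirror the allocation order: free the policy data, then the percpu lhead, then the blkcg itself, each failure jumping to the label that undoes only what was already allocated. Every failure here is an allocation failure, so the ret variable also goes away and the error path returns ERR_PTR(-ENOMEM) directly. The same unwind shape in a stripped-down, hypothetical sketch (plain userspace C, not kernel code):

#include <stdlib.h>

struct thing { void *a, *b, *c; };

static struct thing *thing_alloc(void)
{
	struct thing *t = malloc(sizeof(*t));

	if (!t)
		return NULL;
	if (!(t->a = malloc(16)))
		goto free_thing;
	if (!(t->b = malloc(16)))
		goto free_a;
	if (!(t->c = malloc(16)))
		goto free_b;
	return t;

free_b:				/* unwind in reverse allocation order */
	free(t->b);
free_a:
	free(t->a);
free_thing:
	free(t);
	return NULL;
}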
@@ -1944,6 +1991,7 @@ static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
 	unsigned long flags;
@@ -1962,9 +2010,21 @@ void blk_cgroup_bio_start(struct bio *bio)
 	}
 	bis->cur.ios[rwd]++;
 
+	/*
+	 * If the iostat_cpu isn't in a lockless list, put it into the
+	 * list to indicate that a stat update is pending.
+	 */
+	if (!READ_ONCE(bis->lqueued)) {
+		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+		llist_add(&bis->lnode, lhead);
+		WRITE_ONCE(bis->lqueued, true);
+		percpu_ref_get(&bis->blkg->refcnt);
+	}
+
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
 
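Taken together, the update side above and the flush side in blkcg_rstat_flush() form a small per-iostat_cpu protocol: queue the entry at most once per flush window, and pin the owning blkg with a percpu_ref while it is queued so the blkg cannot be freed underneath the flusher. Stripped of the stats bookkeeping, the pairing looks like this sketch, using only calls already present in the patch:

	/* update side, on the local cpu, inside the u64_stats write section */
	if (!READ_ONCE(bis->lqueued)) {
		llist_add(&bis->lnode, this_cpu_ptr(blkcg->lhead));
		WRITE_ONCE(bis->lqueued, true);
		percpu_ref_get(&bis->blkg->refcnt);	/* keep blkg alive while queued */
	}

	/* flush side, draining one cpu's list */
	lnode = llist_del_all(per_cpu_ptr(blkcg->lhead, cpu));
	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
		WRITE_ONCE(bisc->lqueued, false);	/* may be requeued from here on */
		/* ... fold bisc->cur into bisc->blkg->iostat ... */
		percpu_ref_put(&bisc->blkg->refcnt);
	}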