
Commit d7974b6

blk-cgroup: Optimize blkcg_rstat_flush()
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2077665

commit 3b8cc62
Author: Waiman Long <longman@redhat.com>
Date: Fri, 4 Nov 2022 20:59:01 -0400

    blk-cgroup: Optimize blkcg_rstat_flush()

    For a system with many CPUs and block devices, the time to do
    blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It can
    be especially problematic as interrupts are disabled during the flush. It
    was reported that it might take seconds to complete in some extreme cases,
    leading to hard-lockup messages.

    As it is likely that not all the percpu blkg_iostat_set's have been
    updated since the last flush, those stale blkg_iostat_set's don't need to
    be flushed in this case. This patch optimizes blkcg_rstat_flush() by
    keeping a lockless list of recently updated blkg_iostat_set's in a newly
    added percpu blkcg->lhead pointer.

    The blkg_iostat_set is added to the lockless list on the update side in
    blk_cgroup_bio_start() and removed from the list when flushed in
    blkcg_rstat_flush(). Due to racing, it is possible that blkg_iostat_set's
    in the lockless list have no new IO stats to be flushed, but that is OK.

    To protect against destruction of the blkg, a percpu reference is taken
    when a blkg_iostat_set is put onto the lockless list and put back when it
    is removed.

    When booting up an instrumented test kernel with this patch on a 2-socket
    96-thread system with cgroup v2, 1788 of the 2051 calls to
    cgroup_rstat_flush() after bootup exited immediately because of an empty
    lockless list. After an all-CPU kernel build, the ratio became
    6295424/6340513, more than 99%.

    Signed-off-by: Waiman Long <longman@redhat.com>
    Acked-by: Tejun Heo <tj@kernel.org>
    Link: https://lore.kernel.org/r/20221105005902.407297-3-longman@redhat.com
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

Signed-off-by: Waiman Long <longman@redhat.com>
1 parent 4b0de36 commit d7974b6
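The technique the message describes is easy to see in miniature. The following is a single-threaded userspace sketch of the update/flush protocol, using a C11-atomics push in place of the kernel's llist_add()/llist_del_all() and a plain flag in place of lqueued; every name in it (entry, stat_update, stat_flush, lhead) is invented for illustration, and none of this is the kernel API:

/*
 * Userspace sketch of the "dirty set" protocol described above.
 * NOT kernel code: names and types here are invented for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
	struct entry *next;	/* analogue of bisc->lnode */
	atomic_bool queued;	/* analogue of bisc->lqueued */
	long cur;		/* pending per-entry stat delta */
};

static _Atomic(struct entry *) lhead;	/* analogue of one blkcg->lhead slot */

/* Update side: record the delta, queue the entry only on first dirtying. */
static void stat_update(struct entry *e, long delta)
{
	e->cur += delta;
	if (!atomic_load_explicit(&e->queued, memory_order_relaxed)) {
		/* lockless push, analogue of llist_add() */
		e->next = atomic_load_explicit(&lhead, memory_order_relaxed);
		while (!atomic_compare_exchange_weak(&lhead, &e->next, e))
			;
		atomic_store_explicit(&e->queued, true, memory_order_relaxed);
	}
}

/* Flush side: detach the whole list at once, analogue of llist_del_all(). */
static long stat_flush(void)
{
	struct entry *e = atomic_exchange(&lhead, NULL);
	long total = 0;

	while (e) {
		struct entry *next = e->next;	/* save before the entry can be requeued */
		atomic_store_explicit(&e->queued, false, memory_order_relaxed);
		total += e->cur;
		e->cur = 0;
		e = next;
	}
	return total;	/* empty list: immediate return, the common case */
}

int main(void)
{
	struct entry a = {0}, b = {0};

	stat_update(&a, 3);
	stat_update(&a, 4);	/* already queued: no second push */
	stat_update(&b, 5);
	printf("flush #1: %ld\n", stat_flush());	/* 12 */
	printf("flush #2: %ld\n", stat_flush());	/* 0 */
	return 0;
}

The point the commit's statistics make is visible in stat_flush(): when nothing was dirtied since the last flush, the exchange returns NULL and the flush is a near no-op, instead of a walk over every blkg on every CPU.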

File tree

2 files changed: +80 additions, -6 deletions

block/blk-cgroup.c

Lines changed: 70 additions & 6 deletions
@@ -60,6 +60,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE	64
 
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+	int cpu;
+
+	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+	if (!blkcg->lhead)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+	return 0;
+}
+
 /**
  * blkcg_css - find the current css
  *
@@ -237,8 +268,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->blkcg = blkcg;
 
 	u64_stats_init(&blkg->iostat.sync);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+	}
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -828,20 +861,31 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode;
+	struct blkg_iostat_set *bisc, *next_bisc;
 
 	/* Root-level stats are sourced from system-wide IO stats */
 	if (!cgroup_parent(css->cgroup))
 		return;
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+	lnode = llist_del_all(lhead);
+	if (!lnode)
+		goto out;
+
+	/*
+	 * Iterate only the iostat_cpu's queued in the lockless list.
+	 */
+	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
-		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
 		struct blkg_iostat cur;
 		unsigned int seq;
 
+		WRITE_ONCE(bisc->lqueued, false);
+
 		/* fetch the current per-cpu values */
 		do {
 			seq = u64_stats_fetch_begin(&bisc->sync);
@@ -854,8 +898,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
+		percpu_ref_put(&blkg->refcnt);
 	}
 
+out:
 	rcu_read_unlock();
 }
 
@@ -1133,6 +1179,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 
 	mutex_unlock(&blkcg_pol_mutex);
 
+	free_percpu(blkcg->lhead);
 	kfree(blkcg);
 }
 
@@ -1152,6 +1199,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto unlock;
 	}
 
+	if (init_blkcg_llists(blkcg))
+		goto free_blkcg;
+
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -1192,7 +1242,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	for (i--; i >= 0; i--)
 		if (blkcg->cpd[i])
 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+	free_percpu(blkcg->lhead);
+free_blkcg:
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 unlock:
@@ -1940,6 +1991,7 @@ static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
 	unsigned long flags;
@@ -1958,9 +2010,21 @@ void blk_cgroup_bio_start(struct bio *bio)
 	}
 	bis->cur.ios[rwd]++;
 
+	/*
+	 * If the iostat_cpu isn't in a lockless list, put it into the
+	 * list to indicate that a stat update is pending.
+	 */
+	if (!READ_ONCE(bis->lqueued)) {
+		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+		llist_add(&bis->lnode, lhead);
+		WRITE_ONCE(bis->lqueued, true);
+		percpu_ref_get(&bis->blkg->refcnt);
+	}
+
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
 
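Two details in the hunks above deserve a callout: each queued entry pins its blkg with a percpu reference until it is consumed, and the lqueued flag is what keeps an iostat_cpu from being queued twice between flushes. Condensed from the diff (u64_stats syncing and the actual stat folding elided):

	/* update side, blk_cgroup_bio_start(): queue at most once per flush cycle */
	if (!READ_ONCE(bis->lqueued)) {
		llist_add(&bis->lnode, this_cpu_ptr(blkcg->lhead));
		WRITE_ONCE(bis->lqueued, true);
		percpu_ref_get(&bis->blkg->refcnt);	/* pin blkg while queued */
	}

	/* flush side, blkcg_rstat_flush(): drain this CPU's list wholesale */
	lnode = llist_del_all(lhead);
	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
		WRITE_ONCE(bisc->lqueued, false);	/* may be requeued from here on */
		/* ... fold bisc->cur into blkg->iostat, propagate to the parent ... */
		percpu_ref_put(&bisc->blkg->refcnt);	/* pairs with the get above */
	}

The _safe iterator is load-bearing here: the moment lqueued is cleared, the update side may call llist_add() on the same lnode again, so each entry's next pointer has to be fetched before the loop body runs.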

block/blk-cgroup.h

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include <linux/cgroup.h>
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
+#include <linux/llist.h>
 
 struct blkcg_gq;
 struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
 
 struct blkg_iostat_set {
 	struct u64_stats_sync		sync;
+	struct blkcg_gq		       *blkg;
+	struct llist_node		lnode;
+	int				lqueued;	/* queued in llist */
 	struct blkg_iostat		cur;
 	struct blkg_iostat		last;
 };
@@ -97,6 +101,12 @@ struct blkcg {
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
+
+	/*
+	 * List of updated percpu blkg_iostat_set's since the last flush.
+	 */
+	struct llist_head __percpu	*lhead;
+
 #ifdef CONFIG_BLK_CGROUP_FC_APPID
 	char                            fc_app_id[FC_APPID_LEN];
 #endif
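For reference, the lifecycle of the new percpu list head, pieced together from the two files above (each line appears in the diff):

	/* allocate: blkcg_css_alloc() -> init_blkcg_llists(), one llist_head per CPU */
	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);

	/* wire up: blkg_alloc() points each percpu iostat_cpu back at its blkg */
	per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;

	/* tear down: blkcg_css_free(), and the error path in blkcg_css_alloc() */
	free_percpu(blkcg->lhead);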
