Commit 378e8d9

Author: Herton R. Krzesinski (committed)
Merge: blk-cgroup: Fix potential lockup in blkcg_rstat_flush()
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/1861
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2077665

On systems with a large number of block devices, blkcg_rstat_flush() can take a long time to execute because of the time needed to iterate all the per-cpu blkg_iostat_set structures. In extreme cases this can lead to a hard lockup, since interrupts are disabled during the iteration. This is fixed by keeping track of the set of updated blkg_iostat_set structures in a lockless list and iterating only those in blkcg_rstat_flush(); the last two patches in the series do that. There are also some minor fixes to those patches, but they have not yet been merged upstream and so are not included. This MR also contains other miscellaneous cgroup fixes and performance enhancements that may help alleviate this particular problem.

Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Phil Auld <pauld@redhat.com>
Approved-by: Rafael Aquini <aquini@redhat.com>
Approved-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
2 parents f50e717 + d7974b6 commit 378e8d9
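The core mechanism described in the commit message — a per-blkcg, per-cpu lockless list that lets the flush side visit only the stat structures that were actually dirtied — can be sketched as follows. This is a minimal, self-contained illustration of the update/flush pattern; the names (item, item_mark_updated, flush_updated_items) are stand-ins, not the patch code itself:

#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/percpu.h>

/* Illustrative stand-in for a per-cpu stats block such as blkg_iostat_set;
 * only the list-tracking fields matter for this sketch.
 */
struct item {
        struct llist_node lnode;
        int lqueued;            /* already queued on a lockless list? */
};

/* Update side (hot path, preemption already disabled): queue the item the
 * first time it is dirtied; later updates cost only a flag test.
 */
static void item_mark_updated(struct item *it, struct llist_head __percpu *lhead)
{
        if (!READ_ONCE(it->lqueued)) {
                llist_add(&it->lnode, this_cpu_ptr(lhead));
                WRITE_ONCE(it->lqueued, true);
        }
}

/* Flush side: detach the whole per-cpu list with one atomic xchg and walk
 * only the items that were actually updated since the last flush.
 */
static void flush_updated_items(struct llist_head __percpu *lhead, int cpu)
{
        struct item *it, *next;
        struct llist_node *lnode = llist_del_all(per_cpu_ptr(lhead, cpu));

        llist_for_each_entry_safe(it, next, lnode, lnode) {
                WRITE_ONCE(it->lqueued, false);
                /* ... fold this item's counters into the aggregate ... */
        }
}

The llist_add()/llist_del_all() pairing is what keeps the hot path cheap: the update side is a flag test plus at most one lock-free push, and the flush side detaches the entire list with a single atomic exchange instead of iterating every blkg in the system.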

File tree

13 files changed: +382, -145 lines

block/blk-cgroup-fc-appid.c

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len)
                 return -EINVAL;
 
         cgrp = cgroup_get_from_id(cgrp_id);
-        if (!cgrp)
-                return -ENOENT;
+        if (IS_ERR(cgrp))
+                return PTR_ERR(cgrp);
         css = cgroup_get_e_css(cgrp, &io_cgrp_subsys);
         if (!css) {
                 ret = -ENOENT;

block/blk-cgroup.c

Lines changed: 74 additions & 14 deletions
@@ -60,6 +60,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE 64
 
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+        int cpu;
+
+        blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+        if (!blkcg->lhead)
+                return -ENOMEM;
+
+        for_each_possible_cpu(cpu)
+                init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+        return 0;
+}
+
 /**
  * blkcg_css - find the current css
  *
@@ -237,8 +268,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
         blkg->blkcg = blkcg;
 
         u64_stats_init(&blkg->iostat.sync);
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
                 u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+                per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+        }
 
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
@@ -828,20 +861,31 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
         struct blkcg *blkcg = css_to_blkcg(css);
-        struct blkcg_gq *blkg;
+        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+        struct llist_node *lnode;
+        struct blkg_iostat_set *bisc, *next_bisc;
 
         /* Root-level stats are sourced from system-wide IO stats */
         if (!cgroup_parent(css->cgroup))
                 return;
 
         rcu_read_lock();
 
-        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+        lnode = llist_del_all(lhead);
+        if (!lnode)
+                goto out;
+
+        /*
+         * Iterate only the iostat_cpu's queued in the lockless list.
+         */
+        llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+                struct blkcg_gq *blkg = bisc->blkg;
                 struct blkcg_gq *parent = blkg->parent;
-                struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
                 struct blkg_iostat cur;
                 unsigned int seq;
 
+                WRITE_ONCE(bisc->lqueued, false);
+
                 /* fetch the current per-cpu values */
                 do {
                         seq = u64_stats_fetch_begin(&bisc->sync);
@@ -854,8 +898,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                 if (parent && parent->parent)
                         blkcg_iostat_update(parent, &blkg->iostat.cur,
                                             &blkg->iostat.last);
+                percpu_ref_put(&blkg->refcnt);
         }
 
+out:
         rcu_read_unlock();
 }
 
@@ -1133,14 +1179,14 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 
         mutex_unlock(&blkcg_pol_mutex);
 
+        free_percpu(blkcg->lhead);
         kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
         struct blkcg *blkcg;
-        struct cgroup_subsys_state *ret;
         int i;
 
         mutex_lock(&blkcg_pol_mutex);
@@ -1149,12 +1195,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                 blkcg = &blkcg_root;
         } else {
                 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-                if (!blkcg) {
-                        ret = ERR_PTR(-ENOMEM);
+                if (!blkcg)
                         goto unlock;
-                }
         }
 
+        if (init_blkcg_llists(blkcg))
+                goto free_blkcg;
+
         for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
                 struct blkcg_policy_data *cpd;
@@ -1169,10 +1216,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                         continue;
 
                 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
-                if (!cpd) {
-                        ret = ERR_PTR(-ENOMEM);
+                if (!cpd)
                         goto free_pd_blkcg;
-                }
+
                 blkcg->cpd[i] = cpd;
                 cpd->blkcg = blkcg;
                 cpd->plid = i;
@@ -1196,12 +1242,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
         for (i--; i >= 0; i--)
                 if (blkcg->cpd[i])
                         blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+        free_percpu(blkcg->lhead);
+free_blkcg:
         if (blkcg != &blkcg_root)
                 kfree(blkcg);
 unlock:
         mutex_unlock(&blkcg_pol_mutex);
-        return ret;
+        return ERR_PTR(-ENOMEM);
 }
 
 static int blkcg_css_online(struct cgroup_subsys_state *css)
@@ -1944,6 +1991,7 @@ static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+        struct blkcg *blkcg = bio->bi_blkg->blkcg;
         int rwd = blk_cgroup_io_type(bio), cpu;
         struct blkg_iostat_set *bis;
         unsigned long flags;
@@ -1962,9 +2010,21 @@ void blk_cgroup_bio_start(struct bio *bio)
         }
         bis->cur.ios[rwd]++;
 
+        /*
+         * If the iostat_cpu isn't in a lockless list, put it into the
+         * list to indicate that a stat update is pending.
+         */
+        if (!READ_ONCE(bis->lqueued)) {
+                struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+                llist_add(&bis->lnode, lhead);
+                WRITE_ONCE(bis->lqueued, true);
+                percpu_ref_get(&bis->blkg->refcnt);
+        }
+
         u64_stats_update_end_irqrestore(&bis->sync, flags);
         if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-                cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+                cgroup_rstat_updated(blkcg->css.cgroup, cpu);
         put_cpu();
 }
 
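The flush hunk above is cut off by a hunk boundary in the middle of the seqcount retry loop. For reference, a consumer of a u64_stats-protected structure conventionally reads it like this — snapshot_iostat is a hypothetical helper, assuming the blkg_iostat_set/blkg_iostat types from block/blk-cgroup.h below:

#include <linux/u64_stats_sync.h>

/* Hypothetical helper: copy out a consistent snapshot of the per-cpu
 * counters, retrying if a writer updated them mid-read.
 */
static void snapshot_iostat(struct blkg_iostat_set *src, struct blkg_iostat *dst)
{
        unsigned int seq;

        do {
                seq = u64_stats_fetch_begin(&src->sync);
                *dst = src->cur;        /* copy the whole counter struct */
        } while (u64_stats_fetch_retry(&src->sync, seq));
}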

block/blk-cgroup.h

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include <linux/cgroup.h>
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
+#include <linux/llist.h>
 
 struct blkcg_gq;
 struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
 
 struct blkg_iostat_set {
         struct u64_stats_sync sync;
+        struct blkcg_gq *blkg;
+        struct llist_node lnode;
+        int lqueued;    /* queued in llist */
         struct blkg_iostat cur;
         struct blkg_iostat last;
 };
@@ -97,6 +101,12 @@ struct blkcg {
         struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
 
         struct list_head all_blkcgs_node;
+
+        /*
+         * List of updated percpu blkg_iostat_set's since the last flush.
+         */
+        struct llist_head __percpu *lhead;
+
 #ifdef CONFIG_BLK_CGROUP_FC_APPID
         char fc_app_id[FC_APPID_LEN];
 #endif

include/linux/cgroup-defs.h

Lines changed: 3 additions & 2 deletions
@@ -264,7 +264,8 @@ struct css_set {
          * List of csets participating in the on-going migration either as
          * source or destination. Protected by cgroup_mutex.
          */
-        struct list_head mg_preload_node;
+        struct list_head mg_src_preload_node;
+        struct list_head mg_dst_preload_node;
         struct list_head mg_node;
 
         /*
@@ -417,7 +418,7 @@ struct cgroup {
         /*
          * The bitmask of subsystems enabled on the child cgroups.
          * ->subtree_control is the one configured through
-         * "cgroup.subtree_control" while ->child_ss_mask is the effective
+         * "cgroup.subtree_control" while ->subtree_ss_mask is the effective
          * one which may have more subsystems enabled. Controller knobs
          * are made available iff it's enabled in ->subtree_control.
          */

include/linux/cgroup.h

Lines changed: 0 additions & 5 deletions
@@ -752,11 +752,6 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 
 static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 {}
-
-static inline struct cgroup *cgroup_get_from_id(u64 id)
-{
-        return NULL;
-}
 #endif /* !CONFIG_CGROUPS */
 
 #ifdef CONFIG_CGROUPS

include/linux/memcontrol.h

Lines changed: 21 additions & 0 deletions
@@ -832,6 +832,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 }
 struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+        return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
+}
+
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
         return mem_cgroup_from_css(seq_css(m));
@@ -1338,6 +1347,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
         return NULL;
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+        return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+        return NULL;
+}
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
         return NULL;
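The new CONFIG_SHRINKER_DEBUG helpers map a memcg to a stable identifier (its cgroup inode number) and back. A hypothetical round trip is sketched below; the failure convention of mem_cgroup_get_from_ino() is an assumption here (the !CONFIG_MEMCG stub returns NULL, and the real lookup is assumed to be able to return an ERR_PTR), so both cases are checked:

#include <linux/memcontrol.h>

/* Hypothetical usage, not part of the patch: identify a memcg by its
 * cgroup inode number and look it up again later.
 */
static bool memcg_ino_round_trips(struct mem_cgroup *memcg)
{
        unsigned long ino = mem_cgroup_ino(memcg);      /* 0 when memcg is NULL */
        struct mem_cgroup *found;

        if (!ino)
                return false;

        found = mem_cgroup_get_from_ino(ino);
        if (IS_ERR_OR_NULL(found))
                return false;

        /* the "get" in the name implies a reference was taken; drop it */
        css_put(&found->css);
        return found == memcg;
}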

include/trace/events/cgroup.h

Lines changed: 6 additions & 6 deletions
@@ -59,8 +59,8 @@ DECLARE_EVENT_CLASS(cgroup,
 
         TP_STRUCT__entry(
                 __field( int, root )
-                __field( int, id )
                 __field( int, level )
+                __field( u64, id )
                 __string( path, path )
         ),
 
@@ -71,7 +71,7 @@ DECLARE_EVENT_CLASS(cgroup,
                 __assign_str(path, path);
         ),
 
-        TP_printk("root=%d id=%d level=%d path=%s",
+        TP_printk("root=%d id=%llu level=%d path=%s",
                   __entry->root, __entry->id, __entry->level, __get_str(path))
 );
 
@@ -126,8 +126,8 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
 
         TP_STRUCT__entry(
                 __field( int, dst_root )
-                __field( int, dst_id )
                 __field( int, dst_level )
+                __field( u64, dst_id )
                 __field( int, pid )
                 __string( dst_path, path )
                 __string( comm, task->comm )
@@ -142,7 +142,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
                 __assign_str(comm, task->comm);
         ),
 
-        TP_printk("dst_root=%d dst_id=%d dst_level=%d dst_path=%s pid=%d comm=%s",
+        TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
                   __entry->dst_root, __entry->dst_id, __entry->dst_level,
                   __get_str(dst_path), __entry->pid, __get_str(comm))
 );
@@ -171,8 +171,8 @@ DECLARE_EVENT_CLASS(cgroup_event,
 
         TP_STRUCT__entry(
                 __field( int, root )
-                __field( int, id )
                 __field( int, level )
+                __field( u64, id )
                 __string( path, path )
                 __field( int, val )
         ),
@@ -185,7 +185,7 @@ DECLARE_EVENT_CLASS(cgroup_event,
                 __entry->val = val;
         ),
 
-        TP_printk("root=%d id=%d level=%d path=%s val=%d",
+        TP_printk("root=%d id=%llu level=%d path=%s val=%d",
                   __entry->root, __entry->id, __entry->level, __get_str(path),
                   __entry->val)
 );

kernel/cgroup/cgroup-v1.c

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
         int retval = 0;
 
         mutex_lock(&cgroup_mutex);
+        cpus_read_lock();
         percpu_down_write(&cgroup_threadgroup_rwsem);
         for_each_root(root) {
                 struct cgroup *from_cgrp;
@@ -75,6 +76,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
                 break;
         }
         percpu_up_write(&cgroup_threadgroup_rwsem);
+        cpus_read_unlock();
         mutex_unlock(&cgroup_mutex);
 
         return retval;
