|
| 1 | +sbitmap: correct wake_batch recalculation to avoid potential IO hung |
| 2 | + |
| 3 | +jira LE-4066 |
| 4 | +Rebuild_History Non-Buildable kernel-4.18.0-553.72.1.el8_10 |
| 5 | +commit-author Kemeng Shi <shikemeng@huaweicloud.com> |
| 6 | +commit b5fcf7871acb7f9a3a8ed341a68bd86aba3e254a |
| 7 | +Empty-Commit: Cherry-Pick Conflicts during history rebuild. |
| 8 | +Will be included in final tarball splat. Ref for failed cherry-pick at: |
| 9 | +ciq/ciq_backports/kernel-4.18.0-553.72.1.el8_10/b5fcf787.failed |
| 10 | + |
| 11 | +Commit 180dccb0dba4f ("blk-mq: fix tag_get wait task can't be awakened") |
| 12 | +mentioned that in case of shared tags, there could be just one real |
| 13 | +active hctx(queue) because of lazy detection of tag idle. Then driver tag |
| 14 | +allocation may wait forever on this real active hctx(queue) if wake_batch |
| 15 | +is > hctx_max_depth where hctx_max_depth is available tags depth for the |
| 16 | +active hctx(queue). However, the condition wake_batch > hctx_max_depth is
| 17 | +not strong enough to avoid IO hung as the sbitmap_queue_wake_up will only |
| 18 | +wake up one wait queue for each wake_batch even though there is only one |
| 19 | +waiter in the woken wait queue. After this, there is only one tag to free |
| 20 | +and wake_batch may not be reached anymore. Commit 180dccb0dba4f ("blk-mq: |
| 21 | +fix tag_get wait task can't be awakened") mentioned that driver tag
| 22 | +allocation may wait forever. Actually, the inactive hctx(queue) will be |
| 23 | +truly idle after at most 30 seconds and will call blk_mq_tag_wakeup_all
| 24 | +to wake one waiter per wait queue to break the hung. But IO hung for 30 |
| 25 | +seconds is also not acceptable. Set the batch size small enough that the
| 26 | +depth of the shared hctx(queue) is sufficient to wake up all of the queues,
| 27 | +as sbq_calc_wake_batch does, to fix this potential IO hung.
| 28 | + |
| 29 | +Although hctx_max_depth will be clamped to at least 4 while wake_batch |
| 30 | +recalculation does not do the clamp, the wake_batch will be always |
| 31 | +recalculated to 1 when hctx_max_depth <= 4. |
| 32 | + |
| 33 | +Fixes: 180dccb0dba4 ("blk-mq: fix tag_get wait task can't be awakened") |
| 34 | + Reviewed-by: Jan Kara <jack@suse.cz> |
| 35 | + Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com> |
| 36 | +Link: https://lore.kernel.org/r/20230116205059.3821738-6-shikemeng@huaweicloud.com |
| 37 | + Signed-off-by: Jens Axboe <axboe@kernel.dk> |
| 38 | +(cherry picked from commit b5fcf7871acb7f9a3a8ed341a68bd86aba3e254a) |
| 39 | + Signed-off-by: Jonathan Maple <jmaple@ciq.com> |
| 40 | + |
| 41 | +# Conflicts: |
| 42 | +# lib/sbitmap.c |
| 43 | +diff --cc lib/sbitmap.c |
| 44 | +index 082298015b44,eff4e42c425a..000000000000 |
| 45 | +--- a/lib/sbitmap.c |
| 46 | ++++ b/lib/sbitmap.c |
| 47 | +@@@ -468,22 -446,26 +468,38 @@@ EXPORT_SYMBOL_GPL(sbitmap_queue_init_no |
| 48 | + static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, |
| 49 | + unsigned int depth) |
| 50 | + { |
| 51 | + - unsigned int wake_batch; |
| 52 | + + unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); |
| 53 | + + int i; |
| 54 | + |
| 55 | + - wake_batch = sbq_calc_wake_batch(sbq, depth); |
| 56 | + - if (sbq->wake_batch != wake_batch) |
| 57 | + + if (sbq->wake_batch != wake_batch) { |
| 58 | + WRITE_ONCE(sbq->wake_batch, wake_batch); |
| 59 | + + /* |
| 60 | + + * Pairs with the memory barrier in sbitmap_queue_wake_up() |
| 61 | + + * to ensure that the batch size is updated before the wait |
| 62 | + + * counts. |
| 63 | + + */ |
| 64 | + + smp_mb(); |
| 65 | + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) |
| 66 | + + atomic_set(&sbq->ws[i].wait_cnt, 1); |
| 67 | + + } |
| 68 | + } |
| 69 | + |
| 70 | +++<<<<<<< HEAD |
| 71 | +++======= |
| 72 | ++ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, |
| 73 | ++ unsigned int users) |
| 74 | ++ { |
| 75 | ++ unsigned int wake_batch; |
| 76 | ++ unsigned int depth = (sbq->sb.depth + users - 1) / users; |
| 77 | ++ |
| 78 | ++ wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, |
| 79 | ++ 1, SBQ_WAKE_BATCH); |
| 80 | ++ |
| 81 | ++ WRITE_ONCE(sbq->wake_batch, wake_batch); |
| 82 | ++ } |
| 83 | ++ EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); |
| 84 | ++ |
| 85 | +++>>>>>>> b5fcf7871acb (sbitmap: correct wake_batch recalculation to avoid potential IO hung) |
| 86 | + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) |
| 87 | + { |
| 88 | + sbitmap_queue_update_wake_batch(sbq, depth); |
| 89 | +* Unmerged path lib/sbitmap.c |
0 commit comments