sbitmap: fix io hung due to race on sbitmap_word::cleared

jira LE-4066
Rebuild_History Non-Buildable kernel-4.18.0-553.72.1.el8_10
commit-author Yang Yang <yang.yang@vivo.com>
commit 72d04bdcf3f7d7e07d82f9757946f68802a7270a
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
Will be included in final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-4.18.0-553.72.1.el8_10/72d04bdc.failed

Configuration for sbq:
  depth=64, wake_batch=6, shift=6, map_nr=1

1. There are 64 requests in progress:
   map->word = 0xFFFFFFFFFFFFFFFF
2. After all the 64 requests complete, and no more requests come:
   map->word = 0xFFFFFFFFFFFFFFFF, map->cleared = 0xFFFFFFFFFFFFFFFF
3. Now two tasks try to allocate requests:
   T1:                                       T2:
   __blk_mq_get_tag                          .
   __sbitmap_queue_get                       .
   sbitmap_get                               .
   sbitmap_find_bit                          .
   sbitmap_find_bit_in_word                  .
   __sbitmap_get_word -> nr=-1               __blk_mq_get_tag
   sbitmap_deferred_clear                    __sbitmap_queue_get
   /* map->cleared=0xFFFFFFFFFFFFFFFF */     sbitmap_find_bit
   if (!READ_ONCE(map->cleared))             sbitmap_find_bit_in_word
     return false;                           __sbitmap_get_word -> nr=-1
   mask = xchg(&map->cleared, 0)             sbitmap_deferred_clear
   atomic_long_andnot()                      /* map->cleared=0 */
                                             if (!(map->cleared))
                                               return false;
                                             /*
                                              * map->cleared is cleared by T1
                                              * T2 fails to acquire the tag
                                              */

4. T2 is the sole tag waiter. When T1 puts the tag, T2 cannot be woken
up because wake_batch is set to 6. If no more requests come, T2 will
wait here indefinitely.
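
The race lives in the pre-patch fast path, which tests ->cleared without
holding any lock. A minimal sketch of that logic, paraphrased from the
code this patch removes (not the verbatim kernel source):

/* Pre-patch sketch: the unlocked check races with the xchg() below */
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
{
	unsigned long mask;

	/* T2 can observe 0 here right after T1's xchg() ... */
	if (!READ_ONCE(map->cleared))
		return false;	/* ... and bails out although ->word has free bits */

	/* First get a stable cleared mask, setting the old mask to 0. */
	mask = xchg(&map->cleared, 0);

	/* Now clear the masked bits in our free word. */
	atomic_long_andnot(mask, (atomic_long_t *)&map->word);
	return true;
}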

This patch achieves two purposes:
1. The check on ->cleared and the updates of both ->cleared and ->word
need to be done atomically; a spinlock is the simplest way to do that.
2. Add an extra check in sbitmap_deferred_clear() to tell whether
->word still has any free bits.
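
For reference, after the patch sbitmap_deferred_clear() behaves as in
the sketch below. It spells out the scope-based guard(spinlock_irqsave)()
used in the actual diff as an explicit lock/unlock pair, so it is
equivalent rather than verbatim:

static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
		unsigned int depth, unsigned int alloc_hint, bool wrap)
{
	unsigned long mask, word_mask;
	unsigned long flags;
	bool ret;

	spin_lock_irqsave(&map->swap_lock, flags);

	if (!map->cleared) {
		/* Nothing deferred; callers passing depth == 0 never retry. */
		if (depth == 0) {
			ret = false;
			goto out;
		}

		word_mask = (~0UL) >> (BITS_PER_LONG - depth);
		/* Honor alloc_hint when the caller does not wrap around. */
		if (!wrap && alloc_hint)
			word_mask &= ~((1UL << alloc_hint) - 1);

		/* Retry only if ->word still has a free bit in range. */
		ret = (READ_ONCE(map->word) & word_mask) != word_mask;
		goto out;
	}

	/* Check and update now happen under the same lock. */
	mask = xchg(&map->cleared, 0);
	atomic_long_andnot(mask, (atomic_long_t *)&map->word);
	ret = true;
out:
	spin_unlock_irqrestore(&map->swap_lock, flags);
	return ret;
}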

Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits")
	Signed-off-by: Yang Yang <yang.yang@vivo.com>
	Reviewed-by: Ming Lei <ming.lei@redhat.com>
	Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240716082644.659566-1-yang.yang@vivo.com
	Signed-off-by: Jens Axboe <axboe@kernel.dk>
(cherry picked from commit 72d04bdcf3f7d7e07d82f9757946f68802a7270a)
	Signed-off-by: Jonathan Maple <jmaple@ciq.com>

# Conflicts:
#	include/linux/sbitmap.h
diff --cc include/linux/sbitmap.h
index c5364c7c3ae5,c09cdcc99471..000000000000
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@@ -40,9 -38,9 +40,15 @@@ struct sbitmap_word
  	unsigned long cleared ____cacheline_aligned_in_smp;
  
  	/**
++<<<<<<< HEAD
 +	 * @swap_lock: Held while swapping word <-> cleared
 +	 */
 +	RH_KABI_DEPRECATE(spinlock_t, swap_lock)
++=======
+ 	 * @swap_lock: serializes simultaneous updates of ->word and ->cleared
+ 	 */
+ 	spinlock_t swap_lock;
++>>>>>>> 72d04bdcf3f7 (sbitmap: fix io hung due to race on sbitmap_word::cleared)
  } ____cacheline_aligned_in_smp;
  
  /**
* Unmerged path include/linux/sbitmap.h
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 59f985b64b05..8da65013f38f 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -76,12 +76,30 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
 /*
  * See if we have deferred clears that we can batch move
  */
-static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
+static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
+		unsigned int depth, unsigned int alloc_hint, bool wrap)
 {
-	unsigned long mask;
+	unsigned long mask, word_mask;
 
-	if (!READ_ONCE(map->cleared))
-		return false;
+	guard(spinlock_irqsave)(&map->swap_lock);
+
+	if (!map->cleared) {
+		if (depth == 0)
+			return false;
+
+		word_mask = (~0UL) >> (BITS_PER_LONG - depth);
+		/*
+		 * The current behavior is to always retry after moving
+		 * ->cleared to word, and we change it to retry in case
+		 * of any free bits. To avoid an infinite loop, we need
+		 * to take wrap & alloc_hint into account, otherwise a
+		 * soft lockup may occur.
+		 */
+		if (!wrap && alloc_hint)
+			word_mask &= ~((1UL << alloc_hint) - 1);
+
+		return (READ_ONCE(map->word) & word_mask) != word_mask;
+	}
 
 	/*
 	 * First get a stable cleared mask, setting the old mask to 0.
@@ -101,6 +119,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      bool alloc_hint)
 {
 	unsigned int bits_per_word;
+	int i;
 
 	if (shift < 0)
 		shift = sbitmap_calculate_shift(depth);
@@ -138,6 +157,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		*SB_ALLOC_HINT_PTR(sb) = NULL;
 	}
 
+	for (i = 0; i < sb->map_nr; i++)
+		spin_lock_init(&sb->map[i].swap_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -148,7 +170,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 	unsigned int i;
 
 	for (i = 0; i < sb->map_nr; i++)
-		sbitmap_deferred_clear(&sb->map[i]);
+		sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
 
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
@@ -199,7 +221,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 					alloc_hint, wrap);
 		if (nr != -1)
 			break;
-		if (!sbitmap_deferred_clear(map))
+		if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
 			break;
 	} while (1);
 
@@ -519,7 +541,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
 		unsigned int map_depth = __map_depth(sb, index);
 		unsigned long val;
 
-		sbitmap_deferred_clear(map);
+		sbitmap_deferred_clear(map, 0, 0, 0);
 		val = READ_ONCE(map->word);
 		if (val == (1UL << (map_depth - 1)) - 1)
 			goto next;
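
As a sanity check on the word_mask arithmetic added to
sbitmap_deferred_clear() above, the standalone C program below walks
through one case; depth = 64 and alloc_hint = 3 are illustrative
values, not taken from the bug report:

#include <stdio.h>

#define BITS_PER_LONG 64

int main(void)
{
	unsigned int depth = 64, alloc_hint = 3;	/* illustrative only */
	unsigned long word = 0xFFFFFFFFFFFFFFFFUL;	/* all 64 bits busy */
	unsigned long word_mask = (~0UL) >> (BITS_PER_LONG - depth);

	/* !wrap && alloc_hint: bits below the hint will not be scanned. */
	word_mask &= ~((1UL << alloc_hint) - 1);

	/* Retry only when some bit covered by the mask is still free. */
	printf("word_mask = %#lx, retry = %d\n",
	       word_mask, (word & word_mask) != word_mask);
	return 0;
}

This prints word_mask = 0xfffffffffffffff8 and retry = 0: with a fully
busy word there is nothing worth retrying, while clearing any bit at
position 3 or above flips retry to 1.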