sbitmap: fix batching wakeup

jira LE-4034
Rebuild_History Non-Buildable kernel-4.18.0-553.72.1.el8_10
commit-author David Jeffery <djeffery@redhat.com>
commit 106397376c0369fcc01c58dd189ff925a2724a57
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
Will be included in final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-4.18.0-553.72.1.el8_10/10639737.failed

The current code assumes that it is enough to provide forward progress by
just waking up one wait queue after one completion batch is done.

Unfortunately this isn't enough, because a waiter can be added to a wait
queue just after it is woken up.

Here is one example (queue depth 64, wake_batch is 8):

1) all 64 tags are active

2) in each wait queue, there is only a single waiter

3) each completion batch (8 completions) wakes up just one waiter in one
   wait queue, and immediately one new sleeper is added to that wait
   queue

4) after 64 completions, 8 waiters have been woken up, but there are
   still 8 waiters, one in each wait queue

5) after another 8 active tags are completed, only one waiter can be
   woken up, and the other 7 can't be woken up anymore.

It turns out this isn't easy to fix properly, so simply wake up enough
waiters for a single batch, as sketched below.
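
As a rough illustration (not the exact backported code), the fix amounts
to spreading a whole batch of wakeups across the wait queues instead of
waking a single waiter. The helper name below is made up, and the sketch
assumes wake_up_nr() returns the number of tasks it actually woke, as in
the upstream hunk quoted further down:

static void wake_batch_sketch(struct sbitmap_queue *sbq, int nr)
{
        int i, wake_index, woken;

        if (!atomic_read(&sbq->ws_active))
                return;

        wake_index = atomic_read(&sbq->wake_index);
        for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
                struct sbq_wait_state *ws = &sbq->ws[wake_index];

                /* Advance first so one busy queue cannot starve the rest. */
                wake_index = sbq_index_inc(wake_index);

                if (waitqueue_active(&ws->wait)) {
                        /* Hand out as much of the batch as this queue takes. */
                        woken = wake_up_nr(&ws->wait, nr);
                        if (woken == nr)
                                break;
                        nr -= woken;
                }
        }

        /* Remember where the next wakeup pass should start. */
        if (wake_index != atomic_read(&sbq->wake_index))
                atomic_set(&sbq->wake_index, wake_index);
}

This way every completion batch hands out up to wake_batch wakeups rather
than a single one, so a sleeper that re-queues itself right after being
woken can no longer swallow the whole batch.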

 Cc: Kemeng Shi <shikemeng@huaweicloud.com>
 Cc: Chengming Zhou <zhouchengming@bytedance.com>
 Cc: Jan Kara <jack@suse.cz>
 Signed-off-by: David Jeffery <djeffery@redhat.com>
 Signed-off-by: Ming Lei <ming.lei@redhat.com>
 Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
 Reviewed-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20230721095715.232728-1-ming.lei@redhat.com
 Signed-off-by: Jens Axboe <axboe@kernel.dk>
(cherry picked from commit 106397376c0369fcc01c58dd189ff925a2724a57)
 Signed-off-by: Jonathan Maple <jmaple@ciq.com>

# Conflicts:
#	lib/sbitmap.c
diff --cc lib/sbitmap.c
index 082298015b44,d0a5081dfd12..000000000000
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@@ -566,106 -548,55 +566,115 @@@ void sbitmap_queue_min_shallow_depth(st
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
 
 -static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
 +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
- int i, wake_index;
+ int i, wake_index, woken;
 
 if (!atomic_read(&sbq->ws_active))
 - return;
 + return NULL;
 
 wake_index = atomic_read(&sbq->wake_index);
 for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 struct sbq_wait_state *ws = &sbq->ws[wake_index];
 
 - /*
 - * Advance the index before checking the current queue.
 - * It improves fairness, by ensuring the queue doesn't
 - * need to be fully emptied before trying to wake up
 - * from the next one.
 - */
 - wake_index = sbq_index_inc(wake_index);
 + if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) {
 + if (wake_index != atomic_read(&sbq->wake_index))
 + atomic_set(&sbq->wake_index, wake_index);
 + return ws;
 + }
 
++<<<<<<< HEAD
 + wake_index = sbq_index_inc(wake_index);
++=======
+ if (waitqueue_active(&ws->wait)) {
+ woken = wake_up_nr(&ws->wait, nr);
+ if (woken == nr)
+ break;
+ nr -= woken;
+ }
++>>>>>>> 106397376c03 (sbitmap: fix batching wakeup)
 }
 
 - if (wake_index != atomic_read(&sbq->wake_index))
 - atomic_set(&sbq->wake_index, wake_index);
 + return NULL;
 }
 
 -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
 +static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
 {
 - unsigned int wake_batch = READ_ONCE(sbq->wake_batch);
 - unsigned int wakeups;
 + struct sbq_wait_state *ws;
 + unsigned int wake_batch;
 + int wait_cnt, cur, sub;
 + bool ret;
 
 - if (!atomic_read(&sbq->ws_active))
 - return;
 + if (*nr <= 0)
 + return false;
 
 - atomic_add(nr, &sbq->completion_cnt);
 - wakeups = atomic_read(&sbq->wakeup_cnt);
 + ws = sbq_wake_ptr(sbq);
 + if (!ws)
 + return false;
 
 + cur = atomic_read(&ws->wait_cnt);
 do {
 - if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch)
 - return;
 - } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt,
 - &wakeups, wakeups + wake_batch));
 + /*
 + * For concurrent callers of this, callers should call this
 + * function again to wakeup a new batch on a different 'ws'.
 + */
 + if (cur == 0)
 + return true;
 + sub = min(*nr, cur);
 + wait_cnt = cur - sub;
 + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
 +
 + /*
 + * If we decremented queue without waiters, retry to avoid lost
 + * wakeups.
 + */
 + if (wait_cnt > 0)
 + return !waitqueue_active(&ws->wait);
 +
 + *nr -= sub;
 +
 + /*
 + * When wait_cnt == 0, we have to be particularly careful as we are
 + * responsible to reset wait_cnt regardless whether we've actually
 + * woken up anybody. But in case we didn't wakeup anybody, we still
 + * need to retry.
 + */
 + ret = !waitqueue_active(&ws->wait);
 + wake_batch = READ_ONCE(sbq->wake_batch);
 +
 + /*
 + * Wake up first in case that concurrent callers decrease wait_cnt
 + * while waitqueue is empty.
 + */
 + wake_up_nr(&ws->wait, wake_batch);
 +
 + /*
 + * Pairs with the memory barrier in sbitmap_queue_resize() to
 + * ensure that we see the batch size update before the wait
 + * count is reset.
 + *
 + * Also pairs with the implicit barrier between decrementing wait_cnt
 + * and checking for waitqueue_active() to make sure waitqueue_active()
 + * sees result of the wakeup if atomic_dec_return() has seen the result
 + * of atomic_set().
 + */
 + smp_mb__before_atomic();
 +
 + /*
 + * Increase wake_index before updating wait_cnt, otherwise concurrent
 + * callers can see valid wait_cnt in old waitqueue, which can cause
 + * invalid wakeup on the old waitqueue.
 + */
 + sbq_index_atomic_inc(&sbq->wake_index);
 + atomic_set(&ws->wait_cnt, wake_batch);
 +
 + return ret || *nr;
 +}
 
 - __sbitmap_queue_wake_up(sbq, wake_batch);
 +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
 +{
 + while (__sbq_wake_up(sbq, &nr))
 + ;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
* Unmerged path lib/sbitmap.c
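
For readers of this failed pick: on the HEAD side, the first function
touched by the hunk is the sbq_wake_ptr() helper sketched below
(reconstructed from the ' +' lines above; indentation approximated). The
incoming commit instead wants the wake_up_nr() loop shown between the
conflict markers, which is why the hunk does not apply cleanly to this
version of the code:

static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
        int i, wake_index;

        if (!atomic_read(&sbq->ws_active))
                return NULL;

        wake_index = atomic_read(&sbq->wake_index);
        for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
                struct sbq_wait_state *ws = &sbq->ws[wake_index];

                /* Pick the first queue with waiters and wakeup budget left. */
                if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) {
                        if (wake_index != atomic_read(&sbq->wake_index))
                                atomic_set(&sbq->wake_index, wake_index);
                        return ws;
                }

                wake_index = sbq_index_inc(wake_index);
        }

        return NULL;
}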